From 1ef609162089418cfa45f14e6a3173caeed022ee Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 12 Dec 2024 14:49:59 -0800 Subject: [PATCH 001/142] [r] add tf-idf and log normalization functions --- r/NAMESPACE | 2 + r/NEWS.md | 1 + r/R/transforms.R | 54 +++++++++++++++++++++++ r/man/normalize_log.Rd | 21 +++++++++ r/man/normalize_tfidf.Rd | 21 +++++++++ r/pkgdown/_pkgdown.yml | 2 + r/tests/testthat/test-matrix_transforms.R | 46 +++++++++++++++++++ 7 files changed, 147 insertions(+) create mode 100644 r/man/normalize_log.Rd create mode 100644 r/man/normalize_tfidf.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 8625c143..622d3a88 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -65,6 +65,8 @@ export(min_by_row) export(min_scalar) export(multiply_cols) export(multiply_rows) +export(normalize_log) +export(normalize_tfidf) export(nucleosome_counts) export(open_fragments_10x) export(open_fragments_dir) diff --git a/r/NEWS.md b/r/NEWS.md index 9715c95a..890f2755 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -22,6 +22,7 @@ Contributions welcome :) - Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices. If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. 
(pull request #128) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) +- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) ## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/transforms.R b/r/R/transforms.R index 2a5de994..7d1c46f8 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -923,3 +923,57 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { vars_to_regress = vars_to_regress ) } + +################# +# Normalizations +################# + +#' Normalize a matrix using log normalization +#' @param mat (IterableMatrix) Matrix to normalize +#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization +#' @param add_one (logical) Add one to the matrix before log normalization +#' @returns log normalized matrix. +#' @export +normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE) { + assert_is(mat, "IterableMatrix") + assert_is_numeric(scale_factor) + assert_true(is.logical(add_one)) + assert_greater_than_zero(scale_factor) + mat <- mat * scale_factor + if (!add_one) mat <- mat - 1 + return(log1p(mat)) +} + + +#' Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency +#' @param mat (IterableMatrix) to normalize +#' @param feature_means (numeric) Means of the features to normalize by. If no names are provided, then +#' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. +#' Else, map each feature name to its mean value. +#' @returns tf-idf normalized matrix. 
+#' @export +normalize_tfidf <- function(mat, feature_means = NULL, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(threads) + # If feature means are passed in, only need to calculate term frequency + if (is.null(feature_means)) { + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + feature_means <- mat_stats$row_stats["mean", ] + read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) + } else { + assert_is_numeric(feature_means) + if (!is.null(names(feature_means)) && !is.null(rownames(mat))) { + # Make sure every name in feature means exists in rownames(mat) + # In the case there is a length mismatch but the feature names all exist in feature_means + # will not error out + assert_true(all(rownames(mat) %in% names(feature_means))) + feature_means <- feature_means[rownames(mat)] + } else { + assert_len(feature_means, nrow(mat)) + } + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean",] * nrow(mat) + } + tf <- mat %>% multiply_cols(1 / read_depth) + idf <- 1 / feature_means + return(tf %>% multiply_rows(idf)) +} \ No newline at end of file diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd new file mode 100644 index 00000000..90a57f85 --- /dev/null +++ b/r/man/normalize_log.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transforms.R +\name{normalize_log} +\alias{normalize_log} +\title{Normalize a matrix using log normalization} +\usage{ +normalize_log(mat, scale_factor = 10000, add_one = TRUE) +} +\arguments{ +\item{mat}{(IterableMatrix) Matrix to normalize} + +\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization} + +\item{add_one}{(logical) Add one to the matrix before log normalization} +} +\value{ +log normalized matrix. 
+} +\description{ +Normalize a matrix using log normalization +} diff --git a/r/man/normalize_tfidf.Rd b/r/man/normalize_tfidf.Rd new file mode 100644 index 00000000..bf6a34ef --- /dev/null +++ b/r/man/normalize_tfidf.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transforms.R +\name{normalize_tfidf} +\alias{normalize_tfidf} +\title{Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency} +\usage{ +normalize_tfidf(mat, feature_means = NULL, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) to normalize} + +\item{feature_means}{(numeric) Means of the features to normalize by. If no names are provided, then +each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. +Else, map each feature name to its mean value.} +} +\value{ +tf-idf normalized matrix. +} +\description{ +Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index ea73ec01..22237598 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -126,6 +126,8 @@ reference: - checksum - apply_by_row - regress_out + - normalize_log + - normalize_tfidf - IterableMatrix-methods - pseudobulk_matrix diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index 385605e0..b1941f8b 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -346,3 +346,49 @@ test_that("linear regression works", { expect_equal(as(m1, "matrix"), ans) expect_equal(as(m1t, "matrix"), ans) }) + +test_that("tf-idf normalization works", { + m <- generate_sparse_matrix(5, 5) + rownames(m) <- paste0("row", seq_len(nrow(m))) + rev_rownames <- rev(rownames(m)) + # Create tf-idf normalization for dgCMatrix + res_dgc <- diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix") + + rownames(res_dgc) 
<- rownames(m) + m2 <- as(m, "IterableMatrix") + # Check that we can pass in row means as a (named) vector + row_means <- matrix_stats(m2, row_stats = c("mean"))$row_stats["mean",] + # Test that row means ordering does not matter as long as names exist + row_means_shuffled <- row_means[sample(1:length(row_means))] + # Test that row means can have an extra element as long as all rownames are in the vector + row_means_plus_one <- c(row_means, row6 = 1) + + + res <- normalize_tfidf(m2) + expect_equal(res %>% as("dgCMatrix"), res_dgc) + res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) + expect_identical(res, res_with_row_means) + + res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled) + expect_identical(res_with_row_means, res_with_shuffled_row_means, res) + + res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one) + expect_identical(res, res_with_row_means_with_extra_element) +}) + +test_that("normalize_log works", { + m <- generate_sparse_matrix(5, 5) + m2 <- as(m, "IterableMatrix") + # Test that default params yield the same as log1p on dgCMatrix + res_1 <- as(normalize_log(m2), "dgCMatrix") + expect_equal(res_1, log1p(m*1e4)) + + # Test that changing scale factor works + res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") + expect_identical(res_2, log1p(m*1e5)) + # Test that removing the add_one works + # log of 0 is -inf, but we don't do that on the c side, and just have really large negative numbers. 
+ res_3 <- as(normalize_log(m2, add_one = FALSE), "dgCMatrix") + res_3@x[res_3@x < -700] <- -Inf + expect_identical(as(res_3, "dgeMatrix"), log(m*1e4)) +}) \ No newline at end of file From 98675d0186f2fa7f016543a9f454c3578a5260ef Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 12 Dec 2024 15:13:11 -0800 Subject: [PATCH 002/142] [r] fix normalization tests --- r/tests/testthat/test-matrix_transforms.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index b1941f8b..06bc3337 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -381,14 +381,14 @@ test_that("normalize_log works", { m2 <- as(m, "IterableMatrix") # Test that default params yield the same as log1p on dgCMatrix res_1 <- as(normalize_log(m2), "dgCMatrix") - expect_equal(res_1, log1p(m*1e4)) + expect_equal(res_1, log1p(m*1e4), tolerance = 1e-6) # Test that changing scale factor works res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") - expect_identical(res_2, log1p(m*1e5)) + expect_equal(res_2, log1p(m*1e5), tolerance = 1e-6) # Test that removing the add_one works # log of 0 is -inf, but we don't do that on the c side, and just have really large negative numbers. 
res_3 <- as(normalize_log(m2, add_one = FALSE), "dgCMatrix") - res_3@x[res_3@x < -700] <- -Inf - expect_identical(as(res_3, "dgeMatrix"), log(m*1e4)) + res_3@x[res_3@x < -60] <- -Inf + expect_equal(as(res_3, "dgeMatrix"), log(m*1e4), tolerance = 1e-6) }) \ No newline at end of file From 2f83ae647174ea718f0c05505487ab2ebb55393c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 13 Dec 2024 17:09:56 -0800 Subject: [PATCH 003/142] [r] add in requested changes --- r/R/transforms.R | 38 ++++++++++++++--------- r/man/normalize_log.Rd | 16 +++++----- r/man/normalize_tfidf.Rd | 16 +++++++--- r/pkgdown/_pkgdown.yml | 5 +++ r/tests/testthat/test-matrix_transforms.R | 12 +++---- 5 files changed, 52 insertions(+), 35 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index 7d1c46f8..b2dda267 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -928,31 +928,38 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { # Normalizations ################# -#' Normalize a matrix using log normalization -#' @param mat (IterableMatrix) Matrix to normalize -#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization -#' @param add_one (logical) Add one to the matrix before log normalization -#' @returns log normalized matrix. +#' Normalize a `(features x cells)` matrix using log normalization. +#' @param mat (IterableMatrix) Matrix to normalize. +#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization. +#' @param threads (integer) Number of threads to use.s +#' @returns log normalized matrix. 
For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +#' the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} #' @export -normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE) { +normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE, threads = 1L) { assert_is(mat, "IterableMatrix") assert_is_numeric(scale_factor) assert_true(is.logical(add_one)) assert_greater_than_zero(scale_factor) - mat <- mat * scale_factor - if (!add_one) mat <- mat - 1 - return(log1p(mat)) + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) + mat <- mat %>% multiply_cols(1 / read_depth) + return(log1p(mat * scale_factor)) } -#' Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency -#' @param mat (IterableMatrix) to normalize +#' Normalize a `(features x cells)` matrix using term frequency-inverse document frequency. #' @param feature_means (numeric) Means of the features to normalize by. If no names are provided, then #' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. #' Else, map each feature name to its mean value. -#' @returns tf-idf normalized matrix. +#' @returns tf-idf normalized matrix. 
For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +#' the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +#' @inheritParams normalize_log #' @export -normalize_tfidf <- function(mat, feature_means = NULL, threads = 1L) { +normalize_tfidf <- function( + mat, feature_means = NULL, + scale_factor = 1e4, threads = 1L +) { assert_is(mat, "IterableMatrix") assert_is_wholenumber(threads) # If feature means are passed in, only need to calculate term frequency @@ -971,9 +978,10 @@ normalize_tfidf <- function(mat, feature_means = NULL, threads = 1L) { } else { assert_len(feature_means, nrow(mat)) } - read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean",] * nrow(mat) + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) } tf <- mat %>% multiply_cols(1 / read_depth) idf <- 1 / feature_means - return(tf %>% multiply_rows(idf)) + tf_idf_mat <- tf %>% multiply_rows(idf) + return(log1p(tf_idf_mat * scale_factor)) } \ No newline at end of file diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd index 90a57f85..9de93071 100644 --- a/r/man/normalize_log.Rd +++ b/r/man/normalize_log.Rd @@ -2,20 +2,22 @@ % Please edit documentation in R/transforms.R \name{normalize_log} \alias{normalize_log} -\title{Normalize a matrix using log normalization} +\title{Normalize a \verb{(features x cells)} matrix using log normalization.} \usage{ -normalize_log(mat, scale_factor = 10000, add_one = TRUE) +normalize_log(mat, scale_factor = 10000, add_one = TRUE, threads = 1L) } \arguments{ -\item{mat}{(IterableMatrix) Matrix to normalize} +\item{mat}{(IterableMatrix) Matrix to normalize.} -\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization} +\item{scale_factor}{(numeric) Scale factor to 
multiply matrix by for log normalization.} -\item{add_one}{(logical) Add one to the matrix before log normalization} +\item{threads}{(integer) Number of threads to use.s} } \value{ -log normalized matrix. +log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} } \description{ -Normalize a matrix using log normalization +Normalize a \verb{(features x cells)} matrix using log normalization. } diff --git a/r/man/normalize_tfidf.Rd b/r/man/normalize_tfidf.Rd index bf6a34ef..8dc50b84 100644 --- a/r/man/normalize_tfidf.Rd +++ b/r/man/normalize_tfidf.Rd @@ -2,20 +2,26 @@ % Please edit documentation in R/transforms.R \name{normalize_tfidf} \alias{normalize_tfidf} -\title{Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency} +\title{Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency.} \usage{ -normalize_tfidf(mat, feature_means = NULL, threads = 1L) +normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) } \arguments{ -\item{mat}{(IterableMatrix) to normalize} +\item{mat}{(IterableMatrix) Matrix to normalize.} \item{feature_means}{(numeric) Means of the features to normalize by. If no names are provided, then each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. Else, map each feature name to its mean value.} + +\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} + +\item{threads}{(integer) Number of threads to use.s} } \value{ -tf-idf normalized matrix. +tf-idf normalized matrix. 
For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} } \description{ -Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency +Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency. } diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 22237598..ae2772eb 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -9,6 +9,11 @@ template: includes: in_header: | + + + + + after_body: | diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index 06bc3337..c4c5edb5 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -352,7 +352,7 @@ test_that("tf-idf normalization works", { rownames(m) <- paste0("row", seq_len(nrow(m))) rev_rownames <- rev(rownames(m)) # Create tf-idf normalization for dgCMatrix - res_dgc <- diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix") + res_dgc <- log1p((diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix")) * 1e4) rownames(res_dgc) <- rownames(m) m2 <- as(m, "IterableMatrix") @@ -378,17 +378,13 @@ test_that("tf-idf normalization works", { test_that("normalize_log works", { m <- generate_sparse_matrix(5, 5) + res_dgc <- m %*% diag(1/colSums(m)) %>% as("dgCMatrix") m2 <- as(m, "IterableMatrix") # Test that default params yield the same as log1p on dgCMatrix res_1 <- as(normalize_log(m2), "dgCMatrix") - expect_equal(res_1, log1p(m*1e4), tolerance = 1e-6) + expect_equal(res_1, log1p(res_dgc*1e4), tolerance = 1e-6) # Test that changing scale factor works res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") - expect_equal(res_2, log1p(m*1e5), tolerance = 1e-6) - # Test that removing the 
add_one works - # log of 0 is -inf, but we don't do that on the c side, and just have really large negative numbers. - res_3 <- as(normalize_log(m2, add_one = FALSE), "dgCMatrix") - res_3@x[res_3@x < -60] <- -Inf - expect_equal(as(res_3, "dgeMatrix"), log(m*1e4), tolerance = 1e-6) + expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6) }) \ No newline at end of file From 6381f74d02982c7972198aa32a042a99b0808069 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 13 Dec 2024 17:15:56 -0800 Subject: [PATCH 004/142] [r] removed unused variable --- r/R/transforms.R | 3 +-- r/man/normalize_log.Rd | 2 +- r/tests/testthat/test-matrix_transforms.R | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index b2dda267..8a2bd25b 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -936,10 +936,9 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: #' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} #' @export -normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE, threads = 1L) { +normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_is(mat, "IterableMatrix") assert_is_numeric(scale_factor) - assert_true(is.logical(add_one)) assert_greater_than_zero(scale_factor) read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) mat <- mat %>% multiply_cols(1 / read_depth) diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd index 9de93071..97d8c92e 100644 --- a/r/man/normalize_log.Rd +++ b/r/man/normalize_log.Rd @@ -4,7 +4,7 @@ \alias{normalize_log} \title{Normalize a \verb{(features x cells)} matrix using log normalization.} \usage{ -normalize_log(mat, scale_factor = 10000, add_one = TRUE, threads = 1L) +normalize_log(mat, scale_factor = 10000, threads = 1L) } \arguments{ 
\item{mat}{(IterableMatrix) Matrix to normalize.} diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index c4c5edb5..67641e54 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -365,7 +365,7 @@ test_that("tf-idf normalization works", { res <- normalize_tfidf(m2) - expect_equal(res %>% as("dgCMatrix"), res_dgc) + expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6) res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) expect_identical(res, res_with_row_means) From 8e80dc50b52e15b4fa12adfcd62e785a8a7b75f1 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Sat, 14 Dec 2024 19:22:16 -0800 Subject: [PATCH 005/142] [r] add feature selection methods --- r/NAMESPACE | 2 + r/NEWS.md | 1 + r/R/singlecell_utils.R | 113 +++++++++++++++++++++++ r/man/select_features_by_dispersion.Rd | 42 +++++++++ r/man/select_features_by_mean.Rd | 34 +++++++ r/man/select_features_by_variance.Rd | 41 ++++++++ r/pkgdown/_pkgdown.yml | 3 + r/tests/testthat/test-singlecell_utils.R | 21 ++++- 8 files changed, 256 insertions(+), 1 deletion(-) create mode 100644 r/man/select_features_by_dispersion.Rd create mode 100644 r/man/select_features_by_mean.Rd create mode 100644 r/man/select_features_by_variance.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 622d3a88..c90e1a15 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -110,6 +110,8 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) +export(select_features_by_mean) +export(select_features_by_variance) export(select_regions) export(set_trackplot_height) export(set_trackplot_label) diff --git a/r/NEWS.md b/r/NEWS.md index 890f2755..73ca847d 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -23,6 +23,7 @@ Contributions welcome :) If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. 
(pull request #128) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) +- Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169) ## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 0fa1711d..cd7bc554 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -6,6 +6,118 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. + +################# +# Feature selection +################# + +#' Get the most variable features within a matrix. +#' @param mat (IterableMatrix) dimensions features x cells +#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, +#' all features will be returned. +#' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed. +#' @param threads (integer) Number of threads to use. +#' @returns +#' Return a dataframe with the following columns, sorted descending by variance: +#' - `names`: Feature name. +#' - `score`: Variance of the feature. +#' - `highly_variable`: Logical vector of whether the feature is highly variable. +#' @details +#' Calculate using the following process: +#' 1. Perform an optional term frequency + log normalization, for each feature. +#' 2. Find `num_feats` features with the highest variance across clusters. 
+#' @export +select_features_by_variance <- function( + mat, num_feats = 25000, + normalize = normalize_log, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is(num_feats, "numeric") + num_feats <- min(max(num_feats, 0), nrow(mat)) + + if (!is.null(normalize)) mat <- normalize(mat, threads = threads) + features_df <- tibble::tibble( + names = rownames(mat), + score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",] + ) %>% + dplyr::arrange(desc(score)) %>% + dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) + return(features_df) +} + + +#' Get the features with the highest dispersion within a matrix. +#' @returns +#' Return a dataframe with the following columns, sorted descending by dispersion: +#' - `names`: Feature name. +#' - `score`: Variance of the feature. +#' - `highly_variable`: Logical vector of whether the feature is highly variable. +#' @inheritParams select_features_by_variance +#' @details +#' Calculate using the following process: +#' 1. Perform an optional term frequency + log normalization, for each feature. +#' 2. Find the dispersion (variance/mean) of each feature. +#' 3. Find `num_feats` features with the highest dispersion. 
+select_features_by_dispersion <- function( + mat, num_feats = 25000, + normalize = normalize_log, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is(num_feats, "numeric") + num_feats <- min(max(num_feats, 0), nrow(mat)) + + if (!is.null(normalize)) mat <- normalize(mat, threads = threads) + mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) + features_df <- tibble::tibble( + names = rownames(mat), + score = mat_stats$row_stats["variance", ] / mat_stats$row_stats["mean", ] + ) %>% + dplyr::arrange(desc(score)) %>% + dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) + return(features_df) +} + + +#' Get the top features from a matrix, based on the mean accessibility of each feature. +#' @param num_feats Number of features to deem as highly accessible. If the number is higher than the number of features in the matrix, +#' all features will be returned. +#' @inheritParams select_features_by_variance +#' @returns +#' Return a dataframe with the following columns, sorted descending by mean accessibility: +#' - `names`: Feature name. +#' - `score`: Binarize sum of each feature. +#' - `highly_variable`: Logical vector of whether the feature is highly accessible by mean accessibility. +#' @details +#' Calculate using the following process: +#' 1. Get the sum of each binarized feature. +#' 2. Find `num_feats` features with the highest accessibility. 
+#' @export +select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(num_feats) + assert_greater_than_zero(num_feats) + assert_is(num_feats, "numeric") + num_feats <- min(max(num_feats, 0), nrow(mat)) + # get the sum of each feature, binarized + # get the top features + features_df <- tibble::tibble( + names = rownames(mat), + score = matrix_stats(mat, row_stats = "nonzero", threads = threads)$row_stats["nonzero", ] + ) %>% + dplyr::arrange(desc(score)) %>% + dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) + return(features_df) +} + + #' Test for marker features #' #' Given a features x cells matrix, perform one-vs-all differential @@ -70,6 +182,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } + #' Aggregate counts matrices by cell group or feature. #' #' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each diff --git a/r/man/select_features_by_dispersion.Rd b/r/man/select_features_by_dispersion.Rd new file mode 100644 index 00000000..5d0bc177 --- /dev/null +++ b/r/man/select_features_by_dispersion.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_by_dispersion} +\alias{select_features_by_dispersion} +\title{Get the features with the highest dispersion within a matrix.} +\usage{ +select_features_by_dispersion( + mat, + num_feats = 25000, + normalize = normalize_log, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{normalize}{(function) Normalize matrix using a given function. 
If \code{NULL}, no normalization is performed.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +Return a dataframe with the following columns, sorted descending by dispersion: +\itemize{ +\item \code{names}: Feature name. +\item \code{score}: Variance of the feature. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. +} +} +\description{ +Get the features with the highest dispersion within a matrix. +} +\details{ +Calculate using the following process: +\enumerate{ +\item Perform an optional term frequency + log normalization, for each feature. +\item Find the dispersion (variance/mean) of each feature. +\item Find \code{num_feats} features with the highest dispersion. +} +} diff --git a/r/man/select_features_by_mean.Rd b/r/man/select_features_by_mean.Rd new file mode 100644 index 00000000..c05b0acb --- /dev/null +++ b/r/man/select_features_by_mean.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_by_mean} +\alias{select_features_by_mean} +\title{Get the top features from a matrix, based on the mean accessibility of each feature.} +\usage{ +select_features_by_mean(mat, num_feats = 25000, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{Number of features to deem as highly accessible. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +Return a dataframe with the following columns, sorted descending by mean accessibility: +\itemize{ +\item \code{names}: Feature name. +\item \code{score}: Binarize sum of each feature. +\item \code{highly_variable}: Logical vector of whether the feature is highly accessible by mean accessibility. +} +} +\description{ +Get the top features from a matrix, based on the mean accessibility of each feature. 
+} +\details{ +Calculate using the following process: +\enumerate{ +\item Get the sum of each binarized feature. +\item Find \code{num_feats} features with the highest accessibility. +} +} diff --git a/r/man/select_features_by_variance.Rd b/r/man/select_features_by_variance.Rd new file mode 100644 index 00000000..b7cc375f --- /dev/null +++ b/r/man/select_features_by_variance.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_by_variance} +\alias{select_features_by_variance} +\title{Get the most variable features within a matrix.} +\usage{ +select_features_by_variance( + mat, + num_feats = 25000, + normalize = normalize_log, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +Return a dataframe with the following columns, sorted descending by variance: +\itemize{ +\item \code{names}: Feature name. +\item \code{score}: Variance of the feature. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. +} +} +\description{ +Get the most variable features within a matrix. +} +\details{ +Calculate using the following process: +\enumerate{ +\item Perform an optional term frequency + log normalization, for each feature. +\item Find \code{num_feats} features with the highest variance across clusters. 
+} +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index ae2772eb..526018e4 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -133,6 +133,9 @@ reference: - regress_out - normalize_log - normalize_tfidf + - select_features_by_variance + - select_features_by_dispersion + - select_features_by_mean - IterableMatrix-methods - pseudobulk_matrix diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 8bae8966..8117ed02 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -13,6 +13,24 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = as(m, "dgCMatrix") } +test_that("select_features works general case", { + m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") + for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { + res <- do.call(fn, list(m1, num_feats = 10)) + expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting + expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable + expect_setequal(res$names, rownames(m1)) + res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows + res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) + expect_identical(res_more_feats_than_rows, res_feats_equal_rows) + if (fn != "select_features_by_mean") { + # Check that normalization actually does something + res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) + expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) + } + } +}) + test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -160,4 +178,5 @@ test_that("Pseudobulk aggregation works with multiple return types", { } } } -}) \ No 
newline at end of file +}) + From c50ead2da9d33b375f98c4599ea4152d55a9b8be Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 10 Jan 2025 02:48:40 -0800 Subject: [PATCH 006/142] [r] update select_features_by_dispersion() to reflect archr defaults --- r/R/singlecell_utils.R | 2 +- r/man/select_features_by_dispersion.Rd | 2 +- r/tests/testthat/test-singlecell_utils.R | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index cd7bc554..32e45934 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -64,7 +64,7 @@ select_features_by_variance <- function( #' 3. Find `num_feats` features with the highest dispersion. select_features_by_dispersion <- function( mat, num_feats = 25000, - normalize = normalize_log, + normalize = NULL, threads = 1L ) { assert_is(mat, "IterableMatrix") diff --git a/r/man/select_features_by_dispersion.Rd b/r/man/select_features_by_dispersion.Rd index 5d0bc177..2835c9a8 100644 --- a/r/man/select_features_by_dispersion.Rd +++ b/r/man/select_features_by_dispersion.Rd @@ -7,7 +7,7 @@ select_features_by_dispersion( mat, num_feats = 25000, - normalize = normalize_log, + normalize = NULL, threads = 1L ) } diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 8117ed02..111324b7 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -23,7 +23,7 @@ test_that("select_features works general case", { res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) expect_identical(res_more_feats_than_rows, res_feats_equal_rows) - if (fn != "select_features_by_mean") { + if (fn == "select_features_by_variance") { # Check that normalization actually does something res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) expect_true(!all((res %>% dplyr::arrange(names))$score == 
(res_no_norm %>% dplyr::arrange(names))$score)) From 13c37606f1952f14ea86b22a0399981118eab459 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 30 Oct 2024 19:00:33 -0700 Subject: [PATCH 007/142] [r] add lsi, var feature selection --- r/R/transforms.R | 72 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/r/R/transforms.R b/r/R/transforms.R index 8a2bd25b..7a110677 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -983,4 +983,76 @@ normalize_tfidf <- function( idf <- 1 / feature_means tf_idf_mat <- tf %>% multiply_rows(idf) return(log1p(tf_idf_mat * scale_factor)) +} +#' Compute LSI For a peak matrix +#' @param mat PeakMatrix +#' @param n_dimensions Number of dimensions to keep during PCA +#' @param scale_factor Scale factor for the tf-idf log transform +#' @param verbose Whether to print out progress +#' @param threads Number of threads to use +#' @return dgCMatrix of shape (n_dimensions, ncol(mat)) +#' @details Compute LSI through first doing a tf-idf transform, a z-score normalization, then PCA. +#' Tf-idf implementation is from Stuart & Butler et al. 2019. +#' @export +compute_lsi <- function(mat, n_dimensions = 50, scale_factor = 1e-4, verbose = FALSE, threads = 1) { + assert_is(mat, "IterableMatrix") # Should be a peak matrix, should we enforce this? 
+ assert_is(n_dimensions, "integer") + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + + # Signac implementation + npeaks <- colSums(mat) + tf <- mat %>% multiply_cols(1/npeaks) + idf_ <- ncol(mat) / rowSums(mat) + mat_tfidf <- tf %>% multiply_rows(idf_) + mat_log_tfidf <- log1p(scale_factor * mat_tfidf) + + # run z-score norm + cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance")$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_lsi_norm <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + + # Run pca + svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions) + pca_res <- t(svd_attr$u) %*% mat_lsi_norm + return(pca_res) +} + +#' Get most variable features, given a non-log normalized matrix +highly_variable_features <- function(mat, num_feats, n_bins, verbose = FALSE) { + assert_is(mat, "IterableMatrix") + assert_is(num_feats, "integer") + assert_greater_than_zero(num_feats) + assert_len(num_feats, 1) + assert_is(n_bins, "integer") + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + if (verbose) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), + returning all features", nrow(mat), num_feats)) + return(mat) + } + # Calculate the mean and variance of each feature + # should we set entries that are 0 to NA? 
+ feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats['mean', ] + feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats['variance', ] + feature_dispersion <- feature_vars / feature_means + feature_dispersion <- log(feature_dispersion) + feature_means <- log1p(feature_means) + mean_bins <- cut(feature_means, n_bins, labels = FALSE) + + # Calculate the mean and variance of dispersion of each bin + bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x)) + bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x)) + # Set bin_sd value to bin_mean if bin_sd is NA, results in norm dispersion of 1 + bin_sd[is.na(bin_sd)] <- bin_mean[is.na(bin_sd)] + # map mean_bins indices to bin_stats + feature_dispersion_norm <- (feature_dispersion - bin_mean[mean_bins]) / bin_sd[mean_bins] + names(feature_dispersion_norm) <- names(feature_dispersion) + variable_features_ <- sort(feature_dispersion_norm)[nrow(mat)-num_feats:nrow(mat)] + return(mat[names(variable_features_), ]) } \ No newline at end of file From 36a8983a53de0bbe558743d9ca862413882b6d91 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 4 Nov 2024 15:14:26 -0800 Subject: [PATCH 008/142] [r] add lsi, variable feature selection --- r/NAMESPACE | 2 + r/NEWS.md | 3 + r/R/singlecell_utils.R | 106 +++++++++++++++++++++++ r/man/highly_variable_features.Rd | 36 ++++++++ r/man/lsi.Rd | 46 ++++++++++ r/pkgdown/_pkgdown.yml | 2 + r/tests/testthat/test-singlecell_utils.R | 49 +++++++++++ 7 files changed, 244 insertions(+) create mode 100644 r/man/highly_variable_features.Rd create mode 100644 r/man/lsi.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index c90e1a15..c7e4fd1c 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -48,6 +48,7 @@ export(gene_score_archr) export(gene_score_tiles_archr) export(gene_score_weights_archr) export(get_trackplot_height) +export(highly_variable_features) export(import_matrix_market) export(import_matrix_market_10x) export(knn_annoy) @@ -55,6 +56,7 @@ 
export(knn_hnsw) export(knn_to_geodesic_graph) export(knn_to_snn_graph) export(log1p_slow) +export(lsi) export(marker_features) export(match_gene_symbol) export(matrix_stats) diff --git a/r/NEWS.md b/r/NEWS.md index 73ca847d..24b6b0d9 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -24,6 +24,9 @@ Contributions welcome :) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) - Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169) +- Add MACS2/3 input creation and peak calling through `call_macs_peaks()` (pull request #118) +- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #156). +- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #156). ## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 32e45934..c4d7fbf3 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -233,4 +233,110 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { } } return(res) +} + +#' Perform latent semantic indexing (LSI) on a matrix. +#' @param mat (IterableMatrix) dimensions features x cells +#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param scale_factor (integer) Scale factor for the tf-idf log transform. +#' @param save_in_memory (logical) If TRUE, save the log(tf-idf) matrix in memory. +#' If FALSE, save to a temporary location in disk. 
Saving in memory will result in faster downstream operations, +#' but will require in higher memory usage. Comparison of memory usage and speed is in the details section. +#' @param threads (integer) Number of threads to use. +#' @return dgCMatrix of shape (n_dimensions, ncol(mat)). +#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. +#' +#' ** Saving in memory vs disk: ** +#' Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. +#' This is done to prevent re-calculation of queued operations during PCA optimization. +#' +#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +#' - Saving in memory: 233 MB memory usage, 22.7 seconds runtime +#' - Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +#' +#' @export +lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FALSE, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(n_dimensions) + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_is_wholenumber(threads) + + # log(tf-idf) transform + npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` + tf <- mat %>% multiply_cols(1 / npeaks) + idf_ <- ncol(mat) / rowSums(mat) + mat_tfidf <- tf %>% multiply_rows(idf_) + mat_log_tfidf <- log1p(scale_factor * mat_tfidf) + # Save to prevent re-calculation of queued operations + if (save_in_memory) { + mat_log_tfidf <- write_matrix_memory(mat_log_tfidf, compress = FALSE) + } else { + mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) + } + # Z-score normalization + cell_peak_stats <- matrix_stats(mat_log_tfidf, 
col_stats="variance", threads = threads)$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_lsi_norm <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + # Run pca + svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr_$u) %*% mat_lsi_norm + return(pca_res) +} + +#' Get the most variable features within a matrix +#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, +#' ll features will be returned. +#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +#' and if the number of features +#' within a bin is less than 2, the dispersion is set to 1. +#' @returns IterableMatrix subset of the most variable features. +#' @inheritParams lsi +#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). +#' +#' Calculate using the following process: +#' 1. Calculate the dispersion of each feature (variance / mean) +#' 2. Log normalize dispersion and mean +#' 3. 
Bin the features by their means, and normalize dispersion within each bin +#' @export +highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(n_bins) + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) + return(mat) + } + + feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats["mean", ] + feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats["variance", ] + feature_means[feature_means == 0] <- 1e-12 + feature_dispersion <- feature_vars / feature_means + feature_dispersion[feature_dispersion == 0] <- NA + feature_dispersion <- log(feature_dispersion) + feature_means <- log1p(feature_means) + mean_bins <- cut(feature_means, n_bins, labels = FALSE) + + bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x, na.rm = TRUE)) + bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x, na.rm = TRUE)) + # Set feats that are in bins with only one feat to have a norm dispersion of 1 + one_gene_bin <- is.na(bin_sd) + bin_sd[one_gene_bin] <- bin_mean[one_gene_bin] + bin_mean[one_gene_bin] <- 0 + # map mean_bins indices to bin_stats + # Do a character search as bins without features mess up numeric indexing + feature_dispersion_norm <- (feature_dispersion - bin_mean[as.character(mean_bins)]) / bin_sd[as.character(mean_bins)] + names(feature_dispersion_norm) <- names(feature_dispersion) + feature_dispersion_norm <- sort(feature_dispersion_norm) # sorting automatically removes NA values + if (length(feature_dispersion_norm) < num_feats) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all non-zero features", length(feature_dispersion_norm), num_feats)) + 
variable_features_ <- feature_dispersion_norm[max(1, (length(feature_dispersion_norm) - num_feats + 1)):length(feature_dispersion_norm)] + return(mat[names(variable_features_), ]) } \ No newline at end of file diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd new file mode 100644 index 00000000..54e67599 --- /dev/null +++ b/r/man/highly_variable_features.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{highly_variable_features} +\alias{highly_variable_features} +\title{Get the most variable features within a matrix} +\usage{ +highly_variable_features(mat, num_feats, n_bins, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +ll features will be returned.} + +\item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +and if the number of features +within a bin is less than 2, the dispersion is set to 1.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +IterableMatrix subset of the most variable features. +} +\description{ +Get the most variable features within a matrix +} +\details{ +The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). 
+ +Calculate using the following process: +\enumerate{ +\item Calculate the dispersion of each feature (variance / mean) +\item Log normalize dispersion and mean +\item Bin the features by their means, and normalize dispersion within each bin +} +} diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd new file mode 100644 index 00000000..38c0d73b --- /dev/null +++ b/r/man/lsi.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{lsi} +\alias{lsi} +\title{Perform latent semantic indexing (LSI) on a matrix.} +\usage{ +lsi( + mat, + n_dimensions = 50L, + scale_factor = 10000, + save_in_memory = FALSE, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} + +\item{scale_factor}{(integer) Scale factor for the tf-idf log transform.} + +\item{save_in_memory}{(logical) If TRUE, save the log(tf-idf) matrix in memory. +If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, +but will require in higher memory usage. Comparison of memory usage and speed is in the details section.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +dgCMatrix of shape (n_dimensions, ncol(mat)). +} +\description{ +Perform latent semantic indexing (LSI) on a matrix. +} +\details{ +Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. + +** Saving in memory vs disk: ** +Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. +This is done to prevent re-calculation of queued operations during PCA optimization. 
+ +Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +\itemize{ +\item Saving in memory: 233 MB memory usage, 22.7 seconds runtime +\item Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +} +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 526018e4..3cf6509a 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -136,6 +136,8 @@ reference: - select_features_by_variance - select_features_by_dispersion - select_features_by_mean + - lsi + - highly_variable_features - IterableMatrix-methods - pseudobulk_matrix diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 111324b7..c699d67b 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -13,6 +13,10 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = as(m, "dgCMatrix") } +generate_dense_matrix <- function(nrow, ncol) { + m <- matrix(runif(nrow * ncol), nrow = nrow) +} + test_that("select_features works general case", { m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { @@ -31,6 +35,26 @@ test_that("select_features works general case", { } }) + +test_that("select_features works general case", { + m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") + for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { + res <- do.call(fn, list(m1, num_feats = 10)) + expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting + expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable + expect_setequal(res$names, rownames(m1)) + res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows + res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) + 
expect_identical(res_more_feats_than_rows, res_feats_equal_rows) + if (fn != "select_features_by_mean") { + # Check that normalization actually does something + res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) + expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) + } + } +}) + + test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -180,3 +204,28 @@ test_that("Pseudobulk aggregation works with multiple return types", { } }) + + +test_that("Highly variable feature selection works", { + mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") + # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to Seurat + res <- highly_variable_features(mat, num_feats = 10, n_bins = 5, threads = 1) + res_t <- highly_variable_features(t(mat), num_feats = 10, n_bins = 5, threads = 1) + expect_equal(nrow(res), 10) + expect_equal(ncol(res), 26) + expect_equal(nrow(res_t), 10) + expect_equal(ncol(res_t), 500) +}) + +test_that("LSI works", { + mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to ArchR + lsi_res <- lsi(mat, n_dimensions = 5) + lsi_res_t <- lsi(t(mat), n_dimensions = 5) + expect_equal(nrow(lsi_res), 5) + expect_equal(ncol(lsi_res), ncol(mat)) + expect_equal(nrow(lsi_res_t), 5) + expect_equal(ncol(lsi_res_t), nrow(mat)) +}) \ No newline at end of file From 2be2efebe4f53f03f9d895bdecc33f762babef13 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 16:53:53 -0800 Subject: [PATCH 009/142] [r] parametrize z_score_norm, create temp option to return more info `save_lsi`, remove `save_in_memory` in `lsi()` --- r/R/singlecell_utils.R | 157 +++++++++++++++++++++++++++++------------ r/man/lsi.Rd | 24 ++++--- 2 files changed, 126 insertions(+), 55 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index c4d7fbf3..969acf35 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -237,25 +237,28 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' Perform latent semantic indexing (LSI) on a matrix. #' @param mat (IterableMatrix) dimensions features x cells +#' @param z_score_norm (logical) If TRUE, z-score normalize the matrix before PCA. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. -#' @param save_in_memory (logical) If TRUE, save the log(tf-idf) matrix in memory. -#' If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, -#' but will require in higher memory usage. Comparison of memory usage and speed is in the details section. #' @param threads (integer) Number of threads to use. -#' @return dgCMatrix of shape (n_dimensions, ncol(mat)). +#' @param save_lsi (logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector. 
+#' @return +#' - If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +#' - If save_lsi is TRUE, return a list with the following elements: +#' - **pca_res**: dgCMatrix of shape (n_dimensions, ncol(mat)) +#' - **svd_attr**: List of SVD attributes +#' - **idf**: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. #' -#' ** Saving in memory vs disk: ** -#' Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. -#' This is done to prevent re-calculation of queued operations during PCA optimization. -#' #' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: -#' - Saving in memory: 233 MB memory usage, 22.7 seconds runtime -#' - Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime -#' +#' - 17.1 MB memory usage, 25.1 seconds runtime #' @export -lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FALSE, threads = 1L) { +lsi <- function( + mat, + z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 1e4, + save_lsi = FALSE, + threads = 1L +) { assert_is(mat, "IterableMatrix") assert_is_wholenumber(n_dimensions) assert_len(n_dimensions, 1) @@ -264,27 +267,34 @@ lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FA assert_is_wholenumber(threads) # log(tf-idf) transform + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` tf <- mat %>% multiply_cols(1 / npeaks) idf_ <- ncol(mat) / rowSums(mat) mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) # Save to prevent re-calculation of queued operations - if 
(save_in_memory) { - mat_log_tfidf <- write_matrix_memory(mat_log_tfidf, compress = FALSE) - } else { - mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) - } + mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) # Z-score normalization - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance", threads = threads)$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_lsi_norm <- mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) + if (z_score_norm) { + cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = threads)$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_log_tfidf <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + } # Run pca - svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions, threads = threads) - pca_res <- t(svd_attr_$u) %*% mat_lsi_norm + svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr_$u) %*% mat_log_tfidf + if(save_lsi) { + return(list( + pca_res = pca_res, + svd_attr = svd_attr_, + idf = idf_ + )) + } return(pca_res) } @@ -315,28 +325,87 @@ highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) return(mat) } - - feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats["mean", ] - feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats["variance", ] + mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + feature_vars <- mat_stats$row_stats["variance", ] feature_means[feature_means == 0] <- 1e-12 feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- 
NA feature_dispersion <- log(feature_dispersion) feature_means <- log1p(feature_means) - mean_bins <- cut(feature_means, n_bins, labels = FALSE) - - bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x, na.rm = TRUE)) - bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x, na.rm = TRUE)) - # Set feats that are in bins with only one feat to have a norm dispersion of 1 - one_gene_bin <- is.na(bin_sd) - bin_sd[one_gene_bin] <- bin_mean[one_gene_bin] - bin_mean[one_gene_bin] <- 0 - # map mean_bins indices to bin_stats - # Do a character search as bins without features mess up numeric indexing - feature_dispersion_norm <- (feature_dispersion - bin_mean[as.character(mean_bins)]) / bin_sd[as.character(mean_bins)] - names(feature_dispersion_norm) <- names(feature_dispersion) - feature_dispersion_norm <- sort(feature_dispersion_norm) # sorting automatically removes NA values - if (length(feature_dispersion_norm) < num_feats) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all non-zero features", length(feature_dispersion_norm), num_feats)) - variable_features_ <- feature_dispersion_norm[max(1, (length(feature_dispersion_norm) - num_feats + 1)):length(feature_dispersion_norm)] - return(mat[names(variable_features_), ]) + features_df <- data.frame( + name = names(feature_means), + vars = feature_vars, + means = feature_means, + dispersion = feature_dispersion + ) + features_df <- features_df %>% + dplyr::mutate(bin = cut(means, n_bins, labels=FALSE)) %>% + dplyr::group_by(bin) %>% + dplyr::mutate( + bin_mean = mean(dispersion, na.rm = TRUE), + bin_sd = sd(dispersion, na.rm = TRUE), + bin_sd_is_na = is.na(bin_sd), + bin_sd = ifelse(bin_sd_is_na, bin_mean, bin_sd), # Set feats that are in bins with only one feat to have a norm dispersion of 1 + bin_mean = ifelse(bin_sd_is_na, 0, bin_mean), + feature_dispersion_norm = (dispersion - bin_mean) / bin_sd + ) %>% + dplyr::ungroup() %>% + dplyr::select(name, 
feature_dispersion_norm) %>% + dplyr::arrange(desc(feature_dispersion_norm)) %>% + dplyr::slice(1:min(num_feats, nrow(.))) + return(mat[features_df$name,]) +} + + +#' Aggregate counts matrices by cell group or feature. +#' +#' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each +#' feature. +#' @param cell_groups (Character/factor) Vector of group/cluster assignments for each cell. Length must be `ncol(mat)`. +#' @param method (Character vector) Method(s) to aggregate counts. If one method is provided, the output will be a matrix. If multiple methods are provided, the output will be a named list of matrices. +#' +#' Current options are: `nonzeros`, `sum`, `mean`, `variance`. +#' @param threads (integer) Number of threads to use. +#' @return +#' - If `method` is length `1`, returns a matrix of shape `(features x groups)`. +#' - If `method` is greater than length `1`, returns a list of matrices with each matrix representing a pseudobulk matrix with a different aggregation method. +#' Each matrix is of shape `(features x groups)`, and names are one of `nonzeros`, `sum`, `mean`, `variance`. +#' @details Some simpler stats are calculated in the process of calculating more complex +#' statistics. So when calculating `variance`, `nonzeros` and `mean` can be included with no +#' extra calculation time, and when calculating `mean`, adding `nonzeros` will take no extra time. 
+#' @inheritParams marker_features +#' @export +pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is(cell_groups, c("factor", "character", "numeric")) + assert_true(length(cell_groups) == ncol(mat)) + cell_groups <- as.factor(cell_groups) + assert_is(method, "character") + methods <- c("variance", "mean", "sum", "nonzeros") + for (m in method) { + if (!(m %in% methods)) { + rlang::abort(sprintf("method must be one of: %s", paste(methods, collapse = ", "))) + } + } + assert_is(threads, "integer") + # if multiple methods are provided, only need to pass in the top method as it will also calculate the less complex stats + iter <- iterate_matrix(parallel_split(mat, threads, threads*4)) + res <- pseudobulk_matrix_cpp(iter, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) + # if res is a single matrix, return with colnames and rownames + if (length(method) == 1) { + colnames(res[[method]]) <- levels(cell_groups) + rownames(res[[method]]) <- rownames(mat) + return(res[[method]]) + } + # give colnames and rownames for each matrix in res, which is a named list + for (res_slot in names(res)) { + if ((length(res[[res_slot]]) == 0) || !(res_slot %in% method)) { + res[[res_slot]] <- NULL + } else { + colnames(res[[res_slot]]) <- levels(cell_groups) + rownames(res[[res_slot]]) <- rownames(mat) + } + } + return(res) } \ No newline at end of file diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 38c0d73b..137e4437 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -6,27 +6,34 @@ \usage{ lsi( mat, + z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 10000, - save_in_memory = FALSE, + save_lsi = FALSE, threads = 1L ) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} +\item{z_score_norm}{(logical) If TRUE, z-score normalize the matrix before PCA.} + \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} \item{scale_factor}{(integer) Scale 
factor for the tf-idf log transform.} -\item{save_in_memory}{(logical) If TRUE, save the log(tf-idf) matrix in memory. -If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, -but will require in higher memory usage. Comparison of memory usage and speed is in the details section.} +\item{save_lsi}{(logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector.} \item{threads}{(integer) Number of threads to use.} } \value{ -dgCMatrix of shape (n_dimensions, ncol(mat)). +\itemize{ +\item If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +\item If save_lsi is TRUE, return a list with the following elements: +\item \strong{pca_res}: dgCMatrix of shape (n_dimensions, ncol(mat)) +\item \strong{svd_attr}: List of SVD attributes +\item \strong{idf}: Inverse document frequency vector +} } \description{ Perform latent semantic indexing (LSI) on a matrix. @@ -34,13 +41,8 @@ Perform latent semantic indexing (LSI) on a matrix. \details{ Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. -** Saving in memory vs disk: ** -Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. -This is done to prevent re-calculation of queued operations during PCA optimization. 
- Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: \itemize{ -\item Saving in memory: 233 MB memory usage, 22.7 seconds runtime -\item Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +\item 17.1 MB memory usage, 25.1 seconds runtime } } From dccc3a56e67f104aa4e3c9e46e65fa0ba7c8c9d8 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 16:54:18 -0800 Subject: [PATCH 010/142] [r] add test case for LSI comparing to archr --- r/tests/real_data/ArchR_LSI.R | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 r/tests/real_data/ArchR_LSI.R diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R new file mode 100644 index 00000000..b78d700f --- /dev/null +++ b/r/tests/real_data/ArchR_LSI.R @@ -0,0 +1,50 @@ +devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") +devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") + +#' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. +#' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. +#' As the SVD implementation is not necessarily the same between the two packages, we take the SVD matrix +#' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation +#' we can compare between the two packages. +#' @param proj An archr project. +test_lsi_similarity_to_archr <- function() { + # Set up temp dir + int_dir <- file.path(tempdir(), "insertion_test") + dir.create(int_dir) + setwd(int_dir) + # add iterative lsi for dim reduction + proj <- getTestProject() + proj <- addPeakMatrix(proj) + # Get the peak matrix + test_mat <- assay(getMatrixFromProject(proj, useMatrix = "PeakMatrix")) + # Calculate LSI on ArchR + # running LSI without binarizing, as we don't do this in the BPCells implementation + # we also don't filter quantile outliers. 
+ lsi_archr <- .computeLSI( + mat = test_mat, + LSIMethod = 2, + nDimensions = 2, + binarize = FALSE, + outlierQuantiles = NULL, + test_mat = test_mat + ) + svd_archr <- lsi_archr$svd + lsi_mat_archr <- t(lsi_archr$matSVD) + # set rownames to NA, as we don't have rownames in the BPCells implementation + rownames(lsi_mat_archr) <- NULL + # PCA Matrix = T(u) * Pre-SVD Matrix + # u * PCA Matrix = u * T(u) * Pre-SVD Matrix + # u * PCA Matrix = Pre-SVD Matrix + pre_svd_mat_approx_archr <- lsi_archr$svd$u %*% lsi_mat_archr + # Calculate LSI on BPCells + # Do not use z-score normalization, as this isn't done with ArchR + lsi_bpcells <- lsi( + test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), + z_score_norm = FALSE, + n_dimensions = 2, + save_lsi = TRUE + ) + pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res + testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) +} +test_lsi_similarity_to_archr() \ No newline at end of file From 183dd40a18cddae2d353a1a439d73cfb26af6e5b Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 19:29:32 -0800 Subject: [PATCH 011/142] [r] clean up var gene selection, lsi docstring --- r/R/singlecell_utils.R | 38 ++++++++++++++++++++++++------- r/man/highly_variable_features.Rd | 17 ++++++++++++-- r/man/lsi.Rd | 6 ++--- 3 files changed, 48 insertions(+), 13 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 969acf35..5a00462b 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -236,6 +236,8 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { } #' Perform latent semantic indexing (LSI) on a matrix. +#' +#' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. 
#' @param mat (IterableMatrix) dimensions features x cells #' @param z_score_norm (logical) If TRUE, z-score normalize the matrix before PCA. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. @@ -243,9 +245,9 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' @param threads (integer) Number of threads to use. #' @param save_lsi (logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @return -#' - If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +#' - If save_lsi is FALSE, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. #' - If save_lsi is TRUE, return a list with the following elements: -#' - **pca_res**: dgCMatrix of shape (n_dimensions, ncol(mat)) +#' - **pca_res**: dgCMatrix of shape `(n_dimensions, ncol(mat))`` #' - **svd_attr**: List of SVD attributes #' - **idf**: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. @@ -304,7 +306,12 @@ lsi <- function( #' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. -#' @returns IterableMatrix subset of the most variable features. +#' @param save_feat_selection (logical) If TRUE, save the dispersions, means, and the features selected. +#' @returns +#' - If `save_feat_selection` is False, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - If `save_feat_selection` is True, return a list with the following elements: +#' - **mat**: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. 
+#' - **feature_selection**: Dataframe with the following columns: #' @inheritParams lsi #' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). #' @@ -313,7 +320,11 @@ lsi <- function( #' 2. Log normalize dispersion and mean #' 3. Bin the features by their means, and normalize dispersion within each bin #' @export -highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { +highly_variable_features <- function( + mat, num_feats, n_bins = 20, + save_feat_selection = FALSE, + threads = 1L +) { assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) assert_is_wholenumber(num_feats) @@ -325,22 +336,27 @@ highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) return(mat) } + # Calculate row information for dispersion mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) feature_means <- mat_stats$row_stats["mean", ] feature_vars <- mat_stats$row_stats["variance", ] + # Give a small value to features with 0 mean, helps with 0 division feature_means[feature_means == 0] <- 1e-12 + # Calculate dispersion, and log normalize feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- NA feature_dispersion <- log(feature_dispersion) feature_means <- log1p(feature_means) features_df <- data.frame( name = names(feature_means), - vars = feature_vars, - means = feature_means, + var = feature_vars, + mean = feature_means, dispersion = feature_dispersion ) + + # Bin by mean, and normalize dispersion with each bin features_df <- features_df %>% - dplyr::mutate(bin = cut(means, n_bins, labels=FALSE)) %>% + dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( bin_mean = mean(dispersion, na.rm = TRUE), @@ -351,9 +367,15 @@ highly_variable_features <- 
function(mat, num_feats, n_bins, threads = 1L) { feature_dispersion_norm = (dispersion - bin_mean) / bin_sd ) %>% dplyr::ungroup() %>% - dplyr::select(name, feature_dispersion_norm) %>% + dplyr::select(c(-bin_sd_is_na, -var, -bin_sd, -bin_mean)) %>% dplyr::arrange(desc(feature_dispersion_norm)) %>% dplyr::slice(1:min(num_feats, nrow(.))) + if (save_feat_selection) { + return(list( + mat = mat[features_df$name,], + feature_selection = features_df + )) + } return(mat[features_df$name,]) } diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd index 54e67599..943d986b 100644 --- a/r/man/highly_variable_features.Rd +++ b/r/man/highly_variable_features.Rd @@ -4,7 +4,13 @@ \alias{highly_variable_features} \title{Get the most variable features within a matrix} \usage{ -highly_variable_features(mat, num_feats, n_bins, threads = 1L) +highly_variable_features( + mat, + num_feats, + n_bins = 20, + save_feat_selection = FALSE, + threads = 1L +) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} @@ -16,10 +22,17 @@ ll features will be returned.} and if the number of features within a bin is less than 2, the dispersion is set to 1.} +\item{save_feat_selection}{(logical) If TRUE, save the dispersions, means, and the features selected.} + \item{threads}{(integer) Number of threads to use.} } \value{ -IterableMatrix subset of the most variable features. +\itemize{ +\item If \code{save_feat_selection} is False, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. +\item If \code{save_feat_selection} is True, return a list with the following elements: +\item \strong{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. 
+\item \strong{feature_selection}: Dataframe with the following columns: +} } \description{ Get the most variable features within a matrix diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 137e4437..299513df 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -28,15 +28,15 @@ lsi( } \value{ \itemize{ -\item If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +\item If save_lsi is FALSE, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. \item If save_lsi is TRUE, return a list with the following elements: -\item \strong{pca_res}: dgCMatrix of shape (n_dimensions, ncol(mat)) +\item \strong{pca_res}: dgCMatrix of shape `(n_dimensions, ncol(mat))`` \item \strong{svd_attr}: List of SVD attributes \item \strong{idf}: Inverse document frequency vector } } \description{ -Perform latent semantic indexing (LSI) on a matrix. +Given a \verb{(features x cells)} matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. } \details{ Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
From 4972f3430ab9da12bbde231699a9c52b92ad0cec Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 19:30:00 -0800 Subject: [PATCH 012/142] [r] add variable gene selection test --- r/tests/real_data/ArchR_LSI.R | 9 +++- .../Scanpy_variable_feat_selection.py | 46 ++++++++++++++++++ .../scanpy_variable_feat_selection.R | 47 +++++++++++++++++++ 3 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 r/tests/real_data/Scanpy_variable_feat_selection.py create mode 100644 r/tests/real_data/scanpy_variable_feat_selection.R diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index b78d700f..2ad74610 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -1,3 +1,11 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") @@ -47,4 +55,3 @@ test_lsi_similarity_to_archr <- function() { pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) } -test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/Scanpy_variable_feat_selection.py b/r/tests/real_data/Scanpy_variable_feat_selection.py new file mode 100644 index 00000000..6ae28c1b --- /dev/null +++ b/r/tests/real_data/Scanpy_variable_feat_selection.py @@ -0,0 +1,46 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +import sys, tempfile, os +import numpy as np +import pandas as pd +import scanpy as sc + + +def call_highly_var_genes_single_batch(temp_dir: str) -> None: + """ + Call highly_variable genes on a single batch of PBMC68k data using their interpreation of + the Seurat implementation. + Write the input anndata object csv at `/highly_var_genes_scanpy_input.csv` + Write the output as a csv, at `/highly_var_genes_scanpy_output.csv` + + Args: + temp_dir (str): Path to the temporary directory to write the input and output files. + """ + # Dataset is only (765, 700) + adata = sc.datasets.pbmc68k_reduced() + adata.var_names_make_unique() + res = sc.pp._highly_variable_genes.highly_variable_genes(adata, + n_top_genes = 50, + n_bins = 20, + flavor = "seurat", + inplace = False, + check_values = False).drop(columns = 'mean_bin') + # remove mean_bin + adata.to_df().to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_input.csv")) + res.to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_output.csv")) + + +if __name__ == "__main__": + # We use the first argument as the temporary directory + if len(sys.argv) < 2: + # If no argument is provided, use the current directory + call_highly_var_genes_single_batch(".") + else: + call_highly_var_genes_single_batch(sys.argv[1]) + \ No newline at end of file diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R new file mode 100644 index 00000000..da873a7b --- /dev/null +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -0,0 +1,47 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") + +compare_feat_selection_to_scanpy <- function(dir = NULL) { + # Set up temp dir + if (is.null(dir)) { + dir <- file.path(tempdir(), "feat_selection_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + + # Call python script + system2("python", c("Scanpy_variable_feat_selection.py", dir)) + + # read in input csv + input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) + output_mat_scanpy <- read.csv(file.path(dir, "highly_var_genes_scanpy_output.csv"), row.names = 1) + # filter output mat to only where "highly_variable" is true + output_mat_scanpy$highly_variable <- as.logical(output_mat_scanpy$highly_variable) + output_mat_scanpy <- output_mat_scanpy[output_mat_scanpy$highly_variable,] %>% + dplyr::arrange(desc(dispersions_norm)) %>% + dplyr::select(-highly_variable) %>% # convert rownames to a column + tibble::rownames_to_column("name") %>% + dplyr::as_tibble() + # unlog the input_mat + input_mat_bpcells <- expm1(input_mat_scanpy) + output_bpcells <- highly_variable_features( + input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), + num_feats = 50, + n_bins = 20, + save_feat_selection = TRUE + ) + output_mat_bpcells <- output_bpcells$feature_selection + expect_true(all.equal(output_mat_bpcells$name, output_mat_scanpy$name)) + expect_true(all.equal(output_mat_bpcells$mean, output_mat_scanpy$means, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$dispersion, output_mat_scanpy$dispersions, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$feature_dispersion_norm, output_mat_scanpy$dispersions_norm, tolerance = 1e-6)) +} + +compare_feat_selection_to_scanpy() From e4d5cb01a67f7b2c1ea40fd489c825161bb78e83 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 19:35:03 -0800 Subject: [PATCH 013/142] [r] provide more colour to scanpy feat selection test --- 
r/tests/real_data/scanpy_variable_feat_selection.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R index da873a7b..3bac07ab 100644 --- a/r/tests/real_data/scanpy_variable_feat_selection.R +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -8,6 +8,11 @@ devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") + +# Compare the feature selection output of BPCells to that of Scanpy. +# Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. +# This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. +# It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. compare_feat_selection_to_scanpy <- function(dir = NULL) { # Set up temp dir if (is.null(dir)) { @@ -29,8 +34,10 @@ compare_feat_selection_to_scanpy <- function(dir = NULL) { dplyr::select(-highly_variable) %>% # convert rownames to a column tibble::rownames_to_column("name") %>% dplyr::as_tibble() - # unlog the input_mat + + # Scanpy undoes a log1p transformation on the input matrix, so we do the same here input_mat_bpcells <- expm1(input_mat_scanpy) + output_bpcells <- highly_variable_features( input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), num_feats = 50, From 99470e06b04bd269468e093688e67b25e0a47ffb Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 7 Nov 2024 15:57:13 -0800 Subject: [PATCH 014/142] [r] cleanup real data tests --- r/tests/real_data/ArchR_LSI.R | 17 +++++++------- r/tests/real_data/ArchR_insertions.R | 13 +++++++++-- r/tests/real_data/config.csv | 3 +++ .../scanpy_variable_feat_selection.R | 13 ++++------- r/tests/real_data/test_helpers.R | 22 +++++++++++++++++++ 5 files changed, 
48 insertions(+), 20 deletions(-) create mode 100644 r/tests/real_data/config.csv create mode 100644 r/tests/real_data/test_helpers.R diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index 2ad74610..e79ee98b 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -6,8 +6,9 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +source("test_helpers.R") +devtools::load_all(config[["path_bpcells"]]) +devtools::load_all(config[["path_archr"]]) #' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. #' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. @@ -15,11 +16,9 @@ devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") #' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation #' we can compare between the two packages. #' @param proj An archr project. 
-test_lsi_similarity_to_archr <- function() { - # Set up temp dir - int_dir <- file.path(tempdir(), "insertion_test") - dir.create(int_dir) - setwd(int_dir) +test_lsi_similarity_to_archr <- function(dir = NULL) { + dir <- create_temp_dir(dir) + setwd(dir) # add iterative lsi for dim reduction proj <- getTestProject() proj <- addPeakMatrix(proj) @@ -33,8 +32,7 @@ test_lsi_similarity_to_archr <- function() { LSIMethod = 2, nDimensions = 2, binarize = FALSE, - outlierQuantiles = NULL, - test_mat = test_mat + outlierQuantiles = NULL ) svd_archr <- lsi_archr$svd lsi_mat_archr <- t(lsi_archr$matSVD) @@ -55,3 +53,4 @@ test_lsi_similarity_to_archr <- function() { pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) } +test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 150939d4..810133c1 100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -1,5 +1,14 @@ -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +source("test_helpers.R") +devtools::load_all(config[["path_bpcells"]]) +devtools::load_all(config[["path_archr"]]) fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG) diff --git a/r/tests/real_data/config.csv b/r/tests/real_data/config.csv new file mode 100644 index 00000000..915de4af --- /dev/null +++ b/r/tests/real_data/config.csv @@ -0,0 +1,3 @@ +key,value +path_archr,/mnt/c/users/Imman/PycharmProjects/ArchR/ +path_bpcells,/mnt/c/users/Imman/PycharmProjects/BPCells/r diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R index 3bac07ab..8b608773 100644 --- a/r/tests/real_data/scanpy_variable_feat_selection.R +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -6,23 +6,18 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") - +source("test_helpers.R") +devtools::load_all(config[["path_bpcells"]]) # Compare the feature selection output of BPCells to that of Scanpy. # Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. # This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. # It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. 
compare_feat_selection_to_scanpy <- function(dir = NULL) { - # Set up temp dir - if (is.null(dir)) { - dir <- file.path(tempdir(), "feat_selection_test") - if (dir.exists(dir)) unlink(dir, recursive = TRUE) - dir.create(dir) - } + dir <- create_temp_dir(dir) # Call python script - system2("python", c("Scanpy_variable_feat_selection.py", dir)) + system2("python3", c("Scanpy_variable_feat_selection.py", dir)) # read in input csv input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) diff --git a/r/tests/real_data/test_helpers.R b/r/tests/real_data/test_helpers.R new file mode 100644 index 00000000..2da80072 --- /dev/null +++ b/r/tests/real_data/test_helpers.R @@ -0,0 +1,22 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +# Set up k-v pairs for other tests +config_csv <- read.csv("config.csv") +config <- as.list(config_csv)$value +names(config) <- as.list(config_csv)$key + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} \ No newline at end of file From 3bf8914f987f5d88d558f48539e6d65ef349b889 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 7 Nov 2024 16:15:54 -0800 Subject: [PATCH 015/142] [r] clean up lsi, var features docstrings --- r/R/singlecell_utils.R | 28 ++++++++++++++-------------- r/man/highly_variable_features.Rd | 18 ++++++++++-------- r/man/lsi.Rd | 19 ++++++++++--------- 3 files changed, 34 insertions(+), 31 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 5a00462b..13c697af 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -239,17 +239,17 @@ pseudobulk_matrix <- function(mat, 
cell_groups, method = "sum", threads = 1L) { #' #' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells -#' @param z_score_norm (logical) If TRUE, z-score normalize the matrix before PCA. +#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. +#' #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. -#' @param save_lsi (logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @return -#' - If save_lsi is FALSE, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. -#' - If save_lsi is TRUE, return a list with the following elements: -#' - **pca_res**: dgCMatrix of shape `(n_dimensions, ncol(mat))`` -#' - **svd_attr**: List of SVD attributes -#' - **idf**: Inverse document frequency vector +#' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. +#' - If `save_lsi` is `TRUE`, return a list with the following elements: +#' - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))` +#' - `svd_attr`: List of SVD attributes +#' - `idf`: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. #' #' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: @@ -300,18 +300,18 @@ lsi <- function( return(pca_res) } -#' Get the most variable features within a matrix +#' Get the most variable features within a matrix. 
#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, -#' ll features will be returned. +#' all features will be returned. #' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. -#' @param save_feat_selection (logical) If TRUE, save the dispersions, means, and the features selected. +#' @param save_feat_selection (logical) If `TRUE`, save the dispersions, means, and the features selected. #' @returns -#' - If `save_feat_selection` is False, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - If `save_feat_selection` is True, return a list with the following elements: -#' - **mat**: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - **feature_selection**: Dataframe with the following columns: +#' - If `save_feat_selection` is `FALSE`, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - If `save_feat_selection` is `TRUE`, return a list with the following elements: +#' - `mat`: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - `feature_selection`: Dataframe with columns `name`, `mean`, `dispersion`, `bin`, `feature_dispersion_norm`. #' @inheritParams lsi #' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). 
#' diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd index 943d986b..005a4e37 100644 --- a/r/man/highly_variable_features.Rd +++ b/r/man/highly_variable_features.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/singlecell_utils.R \name{highly_variable_features} \alias{highly_variable_features} -\title{Get the most variable features within a matrix} +\title{Get the most variable features within a matrix.} \usage{ highly_variable_features( mat, @@ -16,26 +16,28 @@ highly_variable_features( \item{mat}{(IterableMatrix) dimensions features x cells} \item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, -ll features will be returned.} +all features will be returned.} \item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, and if the number of features within a bin is less than 2, the dispersion is set to 1.} -\item{save_feat_selection}{(logical) If TRUE, save the dispersions, means, and the features selected.} +\item{save_feat_selection}{(logical) If \code{TRUE}, save the dispersions, means, and the features selected.} \item{threads}{(integer) Number of threads to use.} } \value{ \itemize{ -\item If \code{save_feat_selection} is False, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. -\item If \code{save_feat_selection} is True, return a list with the following elements: -\item \strong{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. -\item \strong{feature_selection}: Dataframe with the following columns: +\item If \code{save_feat_selection} is \code{FALSE}, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. 
+\item If \code{save_feat_selection} is \code{TRUE}, return a list with the following elements: +\itemize{ +\item \code{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. +\item \code{feature_selection}: Dataframe with columns \code{name}, \code{mean}, \code{dispersion}, \code{bin}, \code{feature_dispersion_norm}. +} } } \description{ -Get the most variable features within a matrix +Get the most variable features within a matrix. } \details{ The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 299513df..ea2687f9 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -16,23 +16,24 @@ lsi( \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} -\item{z_score_norm}{(logical) If TRUE, z-score normalize the matrix before PCA.} +\item{z_score_norm}{(logical) If \code{TRUE}, z-score normalize the matrix before PCA.} \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} -\item{scale_factor}{(integer) Scale factor for the tf-idf log transform.} - -\item{save_lsi}{(logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector.} +\item{scale_factor}{(integer) Scale factor for the tf-idf log transform. +#' @param save_lsi (logical) If \code{TRUE}, save the SVD attributes for the matrix, as well as the idf normalization vector.} \item{threads}{(integer) Number of threads to use.} } \value{ \itemize{ -\item If save_lsi is FALSE, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. -\item If save_lsi is TRUE, return a list with the following elements: -\item \strong{pca_res}: dgCMatrix of shape `(n_dimensions, ncol(mat))`` -\item \strong{svd_attr}: List of SVD attributes -\item \strong{idf}: Inverse document frequency vector +\item If \code{save_lsi} is \code{FALSE}, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. 
+\item If \code{save_lsi} is \code{TRUE}, return a list with the following elements: +\itemize{ +\item \code{pca_res}: dgCMatrix of shape \verb{(n_dimensions, ncol(mat))} +\item \code{svd_attr}: List of SVD attributes +\item \code{idf}: Inverse document frequency vector +} } } \description{ From a7c617986ef86e6ce50a7db7af882f8d3e283f15 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 7 Nov 2024 18:45:32 -0800 Subject: [PATCH 016/142] [r] add in more lsi real data tests --- r/tests/real_data/ArchR_LSI.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index e79ee98b..09c04f03 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -30,7 +30,7 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { lsi_archr <- .computeLSI( mat = test_mat, LSIMethod = 2, - nDimensions = 2, + nDimensions = 20, binarize = FALSE, outlierQuantiles = NULL ) @@ -47,10 +47,16 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { lsi_bpcells <- lsi( test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), z_score_norm = FALSE, - n_dimensions = 2, + n_dimensions = 20, save_lsi = TRUE ) pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res - testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) + testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-4)) + # convert signs + lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$pca_res[,1] > 0) - 1), `*`) + # Check for post-pca matrix similarity + testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$pca_res, tolerance = 1e-4)) + # also check for correlation between the two matrices in PC space + testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$pca_res)) > 0.999) } test_lsi_similarity_to_archr() \ No newline at end of file From 
acf35b2520efc9aa70fe005565837bede6888e8d Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 18 Nov 2024 12:56:18 -0800 Subject: [PATCH 017/142] [r] remove unused variable from `lsi()` --- r/R/singlecell_utils.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 13c697af..a59d1d73 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -269,8 +269,6 @@ lsi <- function( assert_is_wholenumber(threads) # log(tf-idf) transform - mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` tf <- mat %>% multiply_cols(1 / npeaks) idf_ <- ncol(mat) / rowSums(mat) From 47256dbaf0407641c73a3c36a33479ee6fe40fc2 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 2 Dec 2024 02:12:19 -0800 Subject: [PATCH 018/142] [r] add requested changes --- r/R/singlecell_utils.R | 37 ++++++++++--------- r/tests/real_data/ArchR_LSI.R | 15 ++++++-- r/tests/real_data/config.csv | 3 -- .../scanpy_variable_feat_selection.R | 13 ++++++- r/tests/real_data/test_helpers.R | 22 ----------- 5 files changed, 42 insertions(+), 48 deletions(-) delete mode 100644 r/tests/real_data/config.csv delete mode 100644 r/tests/real_data/test_helpers.R diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index a59d1d73..9be501fc 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -239,8 +239,8 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' #' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells -#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. 
#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. #' #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. @@ -257,7 +257,7 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' @export lsi <- function( mat, - z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 1e4, + n_dimensions = 50L, z_score_norm = TRUE, scale_factor = 1e4, save_lsi = FALSE, threads = 1L ) { @@ -269,13 +269,17 @@ lsi <- function( assert_is_wholenumber(threads) # log(tf-idf) transform - npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + npeaks <- mat_stats$col_stats["mean",] * nrow(mat) tf <- mat %>% multiply_cols(1 / npeaks) - idf_ <- ncol(mat) / rowSums(mat) + idf_ <- ncol(mat) / (mat_stats$row_stats["mean",] * nrow(mat)) mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) # Save to prevent re-calculation of queued operations - mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) + mat_log_tfidf <- write_matrix_dir( + convert_matrix_type(mat_log_tfidf, type = "float"), + tempfile("mat_log_tfidf"), compress = TRUE + ) # Z-score normalization if (z_score_norm) { cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = threads)$col_stats @@ -338,12 +342,11 @@ highly_variable_features <- function( mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) feature_means <- mat_stats$row_stats["mean", ] feature_vars <- mat_stats$row_stats["variance", ] - # Give 
a small value to features with 0 mean, helps with 0 division - feature_means[feature_means == 0] <- 1e-12 # Calculate dispersion, and log normalize feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- NA feature_dispersion <- log(feature_dispersion) + feature_dispersion[feature_means == 0] <- 0 feature_means <- log1p(feature_means) features_df <- data.frame( name = names(feature_means), @@ -357,24 +360,22 @@ highly_variable_features <- function( dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( - bin_mean = mean(dispersion, na.rm = TRUE), - bin_sd = sd(dispersion, na.rm = TRUE), - bin_sd_is_na = is.na(bin_sd), - bin_sd = ifelse(bin_sd_is_na, bin_mean, bin_sd), # Set feats that are in bins with only one feat to have a norm dispersion of 1 - bin_mean = ifelse(bin_sd_is_na, 0, bin_mean), - feature_dispersion_norm = (dispersion - bin_mean) / bin_sd - ) %>% + feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion), + feature_dispersion_norm = dplyr::if_else(n() == 1, 1, feature_dispersion_norm) # Set feats that are in bins with only one feat to have a norm dispersion of 1 + ) %>% dplyr::ungroup() %>% dplyr::select(c(-bin_sd_is_na, -var, -bin_sd, -bin_mean)) %>% - dplyr::arrange(desc(feature_dispersion_norm)) %>% - dplyr::slice(1:min(num_feats, nrow(.))) + dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats) if (save_feat_selection) { + # get rownames that are in features_df$name + feats_of_interest <- which(rownames(mat) %in% features_df$name) return(list( - mat = mat[features_df$name,], + mat = mat[feats_of_interest,], feature_selection = features_df )) } - return(mat[features_df$name,]) + return(mat[feats_of_interest,]) + #return(mat[features_df$name,]) } diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index 09c04f03..ff697b4c 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -6,9 +6,18 
@@ # option. This file may not be copied, modified, or distributed # except according to those terms. -source("test_helpers.R") -devtools::load_all(config[["path_bpcells"]]) -devtools::load_all(config[["path_archr"]]) +library("BPCells") +library("ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} #' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. #' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. diff --git a/r/tests/real_data/config.csv b/r/tests/real_data/config.csv deleted file mode 100644 index 915de4af..00000000 --- a/r/tests/real_data/config.csv +++ /dev/null @@ -1,3 +0,0 @@ -key,value -path_archr,/mnt/c/users/Imman/PycharmProjects/ArchR/ -path_bpcells,/mnt/c/users/Imman/PycharmProjects/BPCells/r diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R index 8b608773..d2eb08b2 100644 --- a/r/tests/real_data/scanpy_variable_feat_selection.R +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -6,8 +6,17 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -source("test_helpers.R") -devtools::load_all(config[["path_bpcells"]]) +library("BPCells") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} # Compare the feature selection output of BPCells to that of Scanpy. # Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. 
diff --git a/r/tests/real_data/test_helpers.R b/r/tests/real_data/test_helpers.R deleted file mode 100644 index 2da80072..00000000 --- a/r/tests/real_data/test_helpers.R +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2024 BPCells contributors -# -# Licensed under the Apache License, Version 2.0 or the MIT license -# , at your -# option. This file may not be copied, modified, or distributed -# except according to those terms. - -# Set up k-v pairs for other tests -config_csv <- read.csv("config.csv") -config <- as.list(config_csv)$value -names(config) <- as.list(config_csv)$key - -# Set up temp dir in case it's not already set -create_temp_dir <- function(dir = NULL) { - if (is.null(dir)) { - dir <- file.path(tempdir(), "lsi_test") - if (dir.exists(dir)) unlink(dir, recursive = TRUE) - dir.create(dir) - } - return(dir) -} \ No newline at end of file From 004499a0a7119f05bb5fc5cf3fc46058ebb3f94c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 2 Dec 2024 13:54:25 -0800 Subject: [PATCH 019/142] [r] fix requested changes --- r/R/singlecell_utils.R | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 9be501fc..b3d873db 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -270,6 +270,7 @@ lsi <- function( # log(tf-idf) transform mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + npeaks <- mat_stats$col_stats["mean",] * nrow(mat) tf <- mat %>% multiply_cols(1 / npeaks) idf_ <- ncol(mat) / (mat_stats$row_stats["mean",] * nrow(mat)) @@ -360,22 +361,20 @@ highly_variable_features <- function( dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( - feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion), - feature_dispersion_norm = dplyr::if_else(n() == 1, 1, feature_dispersion_norm) # Set feats that are in bins with only one feat to have a norm dispersion of 1 + feature_dispersion_norm = 
(dispersion - mean(dispersion)) / sd(dispersion), + feature_dispersion_norm = if (dplyr::n() == 1) {1} else {feature_dispersion_norm} # Set feats that are in bins with only one feat to have a norm dispersion of 1 ) %>% dplyr::ungroup() %>% - dplyr::select(c(-bin_sd_is_na, -var, -bin_sd, -bin_mean)) %>% - dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats) + dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats, with_ties = FALSE) + feats_of_interest <- which(rownames(mat) %in% features_df$name) # get rownames to get original sorted order if (save_feat_selection) { # get rownames that are in features_df$name - feats_of_interest <- which(rownames(mat) %in% features_df$name) return(list( mat = mat[feats_of_interest,], feature_selection = features_df )) } return(mat[feats_of_interest,]) - #return(mat[features_df$name,]) } From dd801651258686fa71b357fdd5d25345ce4a5d5b Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 3 Dec 2024 15:56:25 -0800 Subject: [PATCH 020/142] [r] fix lsi docstring, idf_ logic --- r/R/singlecell_utils.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index b3d873db..cdee4537 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -242,7 +242,7 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. -#' #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. +#' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. 
#' @return #' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. @@ -270,10 +270,9 @@ lsi <- function( # log(tf-idf) transform mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - npeaks <- mat_stats$col_stats["mean",] * nrow(mat) tf <- mat %>% multiply_cols(1 / npeaks) - idf_ <- ncol(mat) / (mat_stats$row_stats["mean",] * nrow(mat)) + idf_ <- 1 / mat_stats$row_stats["mean",] mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) # Save to prevent re-calculation of queued operations From 8891981744295f0687e92cf12c6d181a08349a8f Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 6 Dec 2024 16:32:58 -0800 Subject: [PATCH 021/142] [r] replace z-score norm with corr cutoffs --- r/R/singlecell_utils.R | 31 ++++++++++++++++------------ r/tests/real_data/ArchR_LSI.R | 1 - r/tests/real_data/ArchR_insertions.R | 15 +++++++++++--- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index cdee4537..28fa45b7 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -240,7 +240,8 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells #' @param n_dimensions (integer) Number of dimensions to keep during PCA. -#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. +#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +#' the corr_cutoff, it will be excluded from the final PCA matrix. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. 
#' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. @@ -248,6 +249,7 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. #' - If `save_lsi` is `TRUE`, return a list with the following elements: #' - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))` +#' - `unused_pcs`: Integer vector of PCs that were filtered out due to high correlation with sequencing depth #' - `svd_attr`: List of SVD attributes #' - `idf`: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. @@ -257,7 +259,7 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' @export lsi <- function( mat, - n_dimensions = 50L, z_score_norm = TRUE, scale_factor = 1e4, + n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, save_lsi = FALSE, threads = 1L ) { @@ -266,12 +268,13 @@ lsi <- function( assert_len(n_dimensions, 1) assert_greater_than_zero(n_dimensions) assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) assert_is_wholenumber(threads) # log(tf-idf) transform mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - npeaks <- mat_stats$col_stats["mean",] * nrow(mat) - tf <- mat %>% multiply_cols(1 / npeaks) + read_depth <- mat_stats$col_stats["mean",] * nrow(mat) + tf <- mat %>% multiply_cols(1 / read_depth) idf_ <- 1 / mat_stats$row_stats["mean",] mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) @@ -280,22 +283,24 @@ lsi <- function( convert_matrix_type(mat_log_tfidf, type = "float"), tempfile("mat_log_tfidf"), compress = TRUE ) - # Z-score normalization - if 
(z_score_norm) { - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = threads)$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_log_tfidf <- mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) - } # Run pca svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads) pca_res <- t(svd_attr_$u) %*% mat_log_tfidf + + # Filter out PCs that are highly correlated with sequencing depth + pca_corrs <- abs(cor(read_depth, t(pca_res))) + pca_feats_to_keep <- which(pca_corrs < corr_cutoff) + if (length(pca_feats_to_keep) != n_dimensions) { + # not sure if this is the route we want to take. Should we just leave the PCs in, + # and not use them in downstream analysis? + pca_res <- t(svd_attr_$u[, pca_feats_to_keep]) %*% mat_log_tfidf + } + if(save_lsi) { return(list( pca_res = pca_res, svd_attr = svd_attr_, + unused_pcs <- which(pca_corrs >= corr_cutoff), idf = idf_ )) } diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index ff697b4c..084645d3 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -55,7 +55,6 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { # Do not use z-score normalization, as this isn't done with ArchR lsi_bpcells <- lsi( test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), - z_score_norm = FALSE, n_dimensions = 20, save_lsi = TRUE ) diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 810133c1..8818d7e7 100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -6,9 +6,18 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. 
-source("test_helpers.R") -devtools::load_all(config[["path_bpcells"]]) -devtools::load_all(config[["path_archr"]]) +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG) From 1e7c6d062dcc50c40a1bfcd5344cb0835cd7889f Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 8 Jan 2025 17:30:41 -0800 Subject: [PATCH 022/142] [r] update LSI to use norm, feature selection helpers --- r/NAMESPACE | 9 +- r/R/singlecell_utils.R | 426 ++++++++++-------- r/R/transforms.R | 72 --- r/man/DimReduction.Rd | 21 + r/man/lsi.Rd | 37 +- r/man/project.Rd | 25 + ...> select_features_by_binned_dispersion.Rd} | 21 +- r/pkgdown/_pkgdown.yml | 9 +- r/tests/real_data/ArchR_LSI.R | 17 +- r/tests/testthat/test-singlecell_utils.R | 40 +- 10 files changed, 340 insertions(+), 337 deletions(-) create mode 100644 r/man/DimReduction.Rd create mode 100644 r/man/project.Rd rename r/man/{highly_variable_features.Rd => select_features_by_binned_dispersion.Rd} (59%) diff --git a/r/NAMESPACE b/r/NAMESPACE index c7e4fd1c..b296a27f 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -2,11 +2,16 @@ S3method(base::as.data.frame,IterableFragments) S3method(base::as.matrix,IterableMatrix) +S3method(project,DimReduction) +S3method(project,LSI) +S3method(project,default) S3method(svds,IterableMatrix) S3method(svds,default) export("all_matrix_inputs<-") export("cellNames<-") export("chrNames<-") +export(DimReduction) +export(LSI) export(add_cols) export(add_rows) export(all_matrix_inputs) @@ -48,7 +53,6 @@ export(gene_score_archr) export(gene_score_tiles_archr) export(gene_score_weights_archr) 
export(get_trackplot_height)
-export(highly_variable_features)
 export(import_matrix_market)
 export(import_matrix_market_10x)
 export(knn_annoy)
@@ -56,7 +60,6 @@ export(knn_hnsw)
 export(knn_to_geodesic_graph)
 export(knn_to_snn_graph)
 export(log1p_slow)
-export(lsi)
 export(marker_features)
 export(match_gene_symbol)
 export(matrix_stats)
@@ -87,6 +90,7 @@ export(plot_tf_footprint)
 export(plot_tss_profile)
 export(plot_tss_scatter)
 export(prefix_cell_names)
+export(project)
 export(pseudobulk_matrix)
 export(qc_scATAC)
 export(range_distance_to_nearest)
@@ -112,6 +116,7 @@ export(rowVars.default)
 export(sctransform_pearson)
 export(select_cells)
 export(select_chromosomes)
+export(select_features_by_binned_dispersion)
 export(select_features_by_mean)
 export(select_features_by_variance)
 export(select_regions)
diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R
index 28fa45b7..c6baece6 100644
--- a/r/R/singlecell_utils.R
+++ b/r/R/singlecell_utils.R
@@ -118,6 +118,233 @@ select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) {
 }
 
 
+#' Get the most variable features within a matrix.
+#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix,
+#' all features will be returned.
+#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin,
+#' and if the number of features
+#' within a bin is less than 2, the dispersion is set to 1.
+#' @inheritParams select_features_by_variance
+#' @returns
+#' Return a dataframe with the following columns, sorted descending by bin-normalized dispersion:
+#' - `names`: Feature name.
+#' - `score`: Bin-normalized dispersion of the feature.
+#' - `highly_variable`: Logical vector of whether the feature is highly variable.
+#' @details The formula for calculating the most variable features is from the Seurat package (Satija et al. 2015).
+#' +#' Calculate using the following process: +#' 1. Calculate the dispersion of each feature (variance / mean) +#' 2. Log normalize dispersion and mean +#' 3. Bin the features by their means, and normalize dispersion within each bin +#' @export +select_features_by_binned_dispersion <- function( + mat, num_feats = 25000, n_bins = 20, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(n_bins) + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) + features_df <- tibble::tibble( + names = rownames(mat), + score = rep(0, nrow(mat)), + highly_variable = rep(TRUE, nrow(mat)) + ) + return(mat) + } + # Calculate row information for dispersion + mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + feature_vars <- mat_stats$row_stats["variance", ] + # Calculate dispersion, and log normalize + feature_dispersion <- feature_vars / feature_means + feature_dispersion[feature_dispersion == 0] <- NA + feature_dispersion <- log1p(feature_dispersion) + feature_dispersion[feature_means == 0] <- 0 + feature_means <- log1p(feature_means) + features_df <- tibble::tibble( + names = names(feature_means), + var = feature_vars, + mean = feature_means, + dispersion = feature_dispersion + ) + # Bin by mean, and normalize dispersion with each bin + features_df <- features_df %>% + dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% + dplyr::group_by(bin) %>% + dplyr::mutate( + score = (dispersion - mean(dispersion)) / sd(dispersion), + score = if (dplyr::n() == 1) {1} else {score} # Set feats that are in bins with only one feat to have a norm dispersion of 1 + ) %>% + dplyr::ungroup() %>% + dplyr::arrange(desc(score)) %>% + 
dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) %>%
+    dplyr::select(c("names", "score", "highly_variable"))
+  return(features_df)
+}
+
+
+#################
+# DimReduction Class Definition
+#################
+
+#' Barebones definition of a DimReduction class.
+#'
+#' Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object.
+#' Child classes should implement a `project` method to allow for the projection of other matrices using
+#' the fitted transformation object.
+#' @field cell_embeddings (IterableMatrix, dgCMatrix, matrix) The projected data
+#' @field fitted_params (list) A list of parameters used for the transformation of a matrix.
+#' @export
+DimReduction <- function(x, fitted_params = list(), ...) {
+  assert_is(x, c("IterableMatrix", "dgCMatrix", "matrix"))
+  assert_is(fitted_params, "list")
+  structure(
+    list(
+      cell_embeddings = x,
+      fitted_params = fitted_params,
+      ...
+    ),
+    class = "DimReduction"
+  )
+}
+
+#' Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object.
+#' @param x DimReduction object.
+#' @param mat IterableMatrix object.
+#' @return IterableMatrix object of the projected data.
+#' @details DimReduction subclasses should use this to project new data with the same features, to project into the same latent space.
+#' All required information to run a projection should be held in x$fitted_params, including pertinent parameters when constructing the DimReduction subclass object.
+#' If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features.
+#' If there are rownames, reorder the matrix to match the order of the original matrix
+#' @export
+project <- function(x, mat, ...) {
+  UseMethod("project")
+}
+#' @export
+project.default <- function(x, mat, ...)
{
+  rlang::abort("project method not implemented for BPCells objects.")
+}
+#' @export
+project.DimReduction <- function(x, mat, ...) {
+  rlang::abort("project method not implemented for base DimReduction object.")
+}
+
+#################
+# LSI Implementation
+#################
+
+
+#' Perform latent semantic indexing (LSI) on a matrix.
+#'
+#' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`.
+#' @param mat (IterableMatrix) dimensions features x cells.
+#' @param n_dimensions (integer) Number of dimensions to keep during PCA.
+#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is greater than or equal to
+#' the corr_cutoff, it will be excluded from the final PCA matrix.
+#' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed.
+#' @param threads (integer) Number of threads to use.
+#' @returns An object of class `c("LSI", "DimReduction")` with the following attributes:
+#' - `cell_embeddings`: The projected data
+#' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following:
+#'   - `normalization_method`: The normalization method used
+#'   - `feature_means`: The means of the features used for normalization
+#'   - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth
+#'   - `svd_params`: The matrix calculated for SVD
+#' - `feature_names`: The names of the features in the matrix
+#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019.
+#' +#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +#' - 17.1 MB memory usage, 25.1 seconds runtime +#' @seealso `project()` `DimReduction()` `normalize_tfidf()` +#' @export +LSI <- function( + mat, n_dimensions = 50L, corr_cutoff = 1, normalize = normalize_tfidf, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(n_dimensions) + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) + assert_is_wholenumber(threads) + + # Normalization + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads) + read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) + if (!is.null(normalize)) mat <- normalize(mat, threads = threads) + + # Save to prevent re-calculation of queued operations + mat <- write_matrix_dir( + convert_matrix_type(mat, type = "float"), + tempfile("mat"), compress = TRUE + ) + # Run pca + svd_attr <- svds(mat, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr$u) %*% mat + + # Filter out PCs that are highly correlated with sequencing depth + pca_corrs <- abs(cor(read_depth, t(pca_res))) + pca_feats_to_keep <- which(pca_corrs < corr_cutoff) + if (length(pca_feats_to_keep) != n_dimensions) { + log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) + pca_res <- pca_res[pca_feats_to_keep, ] + } + fitted_params <- list( + normalization_method = normalize, + feature_means = mat_stats$row_stats["mean", ], + pcs_to_keep = pca_feats_to_keep, + svd_params = svd_attr + ) + res <- DimReduction( + x = pca_res, + fitted_params = fitted_params, + feature_names = rownames(mat) + ) + class(res) <- c("LSI", class(res)) + return(res) +} + + +#' @export +project.LSI <- function(x, mat, threads = 1L, ...) 
{ + assert_is(mat, "IterableMatrix") + assert_is(x, "LSI") + + fitted_params <- x$fitted_params + # Do a check to make sure that the number of rows in the matrix is the same as the number of rows in SVD$u + assert_true(nrow(mat) == nrow(fitted_params$svd_params$u)) + if (!is.null(rownames(mat)) && !is.null(x$feature_names)) { + assert_true(all(x$feature_names %in% rownames(mat))) + mat <- mat[x$feature_names, ] + } + + if (!is.null(fitted_params$normalization_method)) { + mat <- fitted_params$normalization_method( + mat, + feature_means = fitted_params$feature_means, + threads = threads + ) + mat <- write_matrix_dir( + convert_matrix_type(mat, type = "float"), + tempfile("mat"), compress = TRUE + ) + } + pca_attr <- fitted_params$svd_params + res <- t(pca_attr$u) %*% mat + if (length(fitted_params$pcs_to_keep) != nrow(res)) { + res <- res[fitted_params$pcs_to_keep, ] + } + return(res) +} + + #' Test for marker features #' #' Given a features x cells matrix, perform one-vs-all differential @@ -182,205 +409,6 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } - -#' Aggregate counts matrices by cell group or feature. -#' -#' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each -#' feature. -#' @param cell_groups (Character/factor) Vector of group/cluster assignments for each cell. Length must be `ncol(mat)`. -#' @param method (Character vector) Method(s) to aggregate counts. If one method is provided, the output will be a matrix. If multiple methods are provided, the output will be a named list of matrices. -#' -#' Current options are: `nonzeros`, `sum`, `mean`, `variance`. -#' @param threads (integer) Number of threads to use. -#' @return -#' - If `method` is length `1`, returns a matrix of shape `(features x groups)`. -#' - If `method` is greater than length `1`, returns a list of matrices with each matrix representing a pseudobulk matrix with a different aggregation method. 
-#' Each matrix is of shape `(features x groups)`, and names are one of `nonzeros`, `sum`, `mean`, `variance`. -#' @details Some simpler stats are calculated in the process of calculating more complex -#' statistics. So when calculating `variance`, `nonzeros` and `mean` can be included with no -#' extra calculation time, and when calculating `mean`, adding `nonzeros` will take no extra time. -#' @inheritParams marker_features -#' @export -pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { - assert_is(mat, "IterableMatrix") - assert_is(cell_groups, c("factor", "character", "numeric")) - assert_true(length(cell_groups) == ncol(mat)) - cell_groups <- as.factor(cell_groups) - assert_is(method, "character") - methods <- c("variance", "mean", "sum", "nonzeros") - for (m in method) { - if (!(m %in% methods)) { - rlang::abort(sprintf("method must be one of: %s", paste(methods, collapse = ", "))) - } - } - assert_is(threads, "integer") - # if multiple methods are provided, only need to pass in the top method as it will also calculate the less complex stats - iter <- iterate_matrix(parallel_split(mat, threads, threads*4)) - res <- pseudobulk_matrix_cpp(iter, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) - # if res is a single matrix, return with colnames and rownames - if (length(method) == 1) { - colnames(res[[method]]) <- levels(cell_groups) - rownames(res[[method]]) <- rownames(mat) - return(res[[method]]) - } - # give colnames and rownames for each matrix in res, which is a named list - for (res_slot in names(res)) { - if ((length(res[[res_slot]]) == 0) || !(res_slot %in% method)) { - res[[res_slot]] <- NULL - } else { - colnames(res[[res_slot]]) <- levels(cell_groups) - rownames(res[[res_slot]]) <- rownames(mat) - } - } - return(res) -} - -#' Perform latent semantic indexing (LSI) on a matrix. 
-#' -#' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. -#' @param mat (IterableMatrix) dimensions features x cells -#' @param n_dimensions (integer) Number of dimensions to keep during PCA. -#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to -#' the corr_cutoff, it will be excluded from the final PCA matrix. -#' @param scale_factor (integer) Scale factor for the tf-idf log transform. -#' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. -#' @param threads (integer) Number of threads to use. -#' @return -#' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. -#' - If `save_lsi` is `TRUE`, return a list with the following elements: -#' - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))` -#' - `unused_pcs`: Integer vector of PCs that were filtered out due to high correlation with sequencing depth -#' - `svd_attr`: List of SVD attributes -#' - `idf`: Inverse document frequency vector -#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
-#' -#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: -#' - 17.1 MB memory usage, 25.1 seconds runtime -#' @export -lsi <- function( - mat, - n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, - save_lsi = FALSE, - threads = 1L -) { - assert_is(mat, "IterableMatrix") - assert_is_wholenumber(n_dimensions) - assert_len(n_dimensions, 1) - assert_greater_than_zero(n_dimensions) - assert_true(n_dimensions < min(ncol(mat), nrow(mat))) - assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) - assert_is_wholenumber(threads) - - # log(tf-idf) transform - mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - read_depth <- mat_stats$col_stats["mean",] * nrow(mat) - tf <- mat %>% multiply_cols(1 / read_depth) - idf_ <- 1 / mat_stats$row_stats["mean",] - mat_tfidf <- tf %>% multiply_rows(idf_) - mat_log_tfidf <- log1p(scale_factor * mat_tfidf) - # Save to prevent re-calculation of queued operations - mat_log_tfidf <- write_matrix_dir( - convert_matrix_type(mat_log_tfidf, type = "float"), - tempfile("mat_log_tfidf"), compress = TRUE - ) - # Run pca - svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads) - pca_res <- t(svd_attr_$u) %*% mat_log_tfidf - - # Filter out PCs that are highly correlated with sequencing depth - pca_corrs <- abs(cor(read_depth, t(pca_res))) - pca_feats_to_keep <- which(pca_corrs < corr_cutoff) - if (length(pca_feats_to_keep) != n_dimensions) { - # not sure if this is the route we want to take. Should we just leave the PCs in, - # and not use them in downstream analysis? - pca_res <- t(svd_attr_$u[, pca_feats_to_keep]) %*% mat_log_tfidf - } - - if(save_lsi) { - return(list( - pca_res = pca_res, - svd_attr = svd_attr_, - unused_pcs <- which(pca_corrs >= corr_cutoff), - idf = idf_ - )) - } - return(pca_res) -} - -#' Get the most variable features within a matrix. -#' @param num_feats (integer) Number of features to return. 
If the number is higher than the number of features in the matrix, -#' all features will be returned. -#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, -#' and if the number of features -#' within a bin is less than 2, the dispersion is set to 1. -#' @param save_feat_selection (logical) If `TRUE`, save the dispersions, means, and the features selected. -#' @returns -#' - If `save_feat_selection` is `FALSE`, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - If `save_feat_selection` is `TRUE`, return a list with the following elements: -#' - `mat`: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - `feature_selection`: Dataframe with columns `name`, `mean`, `dispersion`, `bin`, `feature_dispersion_norm`. -#' @inheritParams lsi -#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). -#' -#' Calculate using the following process: -#' 1. Calculate the dispersion of each feature (variance / mean) -#' 2. Log normalize dispersion and mean -#' 3. 
Bin the features by their means, and normalize dispersion within each bin -#' @export -highly_variable_features <- function( - mat, num_feats, n_bins = 20, - save_feat_selection = FALSE, - threads = 1L -) { - assert_is(mat, "IterableMatrix") - assert_greater_than_zero(num_feats) - assert_is_wholenumber(num_feats) - assert_len(num_feats, 1) - assert_is_wholenumber(n_bins) - assert_len(n_bins, 1) - assert_greater_than_zero(n_bins) - if (nrow(mat) <= num_feats) { - log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) - return(mat) - } - # Calculate row information for dispersion - mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) - feature_means <- mat_stats$row_stats["mean", ] - feature_vars <- mat_stats$row_stats["variance", ] - # Calculate dispersion, and log normalize - feature_dispersion <- feature_vars / feature_means - feature_dispersion[feature_dispersion == 0] <- NA - feature_dispersion <- log(feature_dispersion) - feature_dispersion[feature_means == 0] <- 0 - feature_means <- log1p(feature_means) - features_df <- data.frame( - name = names(feature_means), - var = feature_vars, - mean = feature_means, - dispersion = feature_dispersion - ) - - # Bin by mean, and normalize dispersion with each bin - features_df <- features_df %>% - dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% - dplyr::group_by(bin) %>% - dplyr::mutate( - feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion), - feature_dispersion_norm = if (dplyr::n() == 1) {1} else {feature_dispersion_norm} # Set feats that are in bins with only one feat to have a norm dispersion of 1 - ) %>% - dplyr::ungroup() %>% - dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats, with_ties = FALSE) - feats_of_interest <- which(rownames(mat) %in% features_df$name) # get rownames to get original sorted order - if (save_feat_selection) { - # get rownames that are in features_df$name 
- return(list( - mat = mat[feats_of_interest,], - feature_selection = features_df - )) - } - return(mat[feats_of_interest,]) -} - #' Aggregate counts matrices by cell group or feature. #' diff --git a/r/R/transforms.R b/r/R/transforms.R index 7a110677..8a2bd25b 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -983,76 +983,4 @@ normalize_tfidf <- function( idf <- 1 / feature_means tf_idf_mat <- tf %>% multiply_rows(idf) return(log1p(tf_idf_mat * scale_factor)) -} -#' Compute LSI For a peak matrix -#' @param mat PeakMatrix -#' @param n_dimensions Number of dimensions to keep during PCA -#' @param scale_factor Scale factor for the tf-idf log transform -#' @param verbose Whether to print out progress -#' @param threads Number of threads to use -#' @return dgCMatrix of shape (n_dimensions, ncol(mat)) -#' @details Compute LSI through first doing a tf-idf transform, a z-score normalization, then PCA. -#' Tf-idf implementation is from Stuart & Butler et al. 2019. -#' @export -compute_lsi <- function(mat, n_dimensions = 50, scale_factor = 1e-4, verbose = FALSE, threads = 1) { - assert_is(mat, "IterableMatrix") # Should be a peak matrix, should we enforce this? 
- assert_is(n_dimensions, "integer") - assert_len(n_dimensions, 1) - assert_greater_than_zero(n_dimensions) - assert_true(n_dimensions < min(ncol(mat), nrow(mat))) - - # Signac implementation - npeaks <- colSums(mat) - tf <- mat %>% multiply_cols(1/npeaks) - idf_ <- ncol(mat) / rowSums(mat) - mat_tfidf <- tf %>% multiply_rows(idf_) - mat_log_tfidf <- log1p(scale_factor * mat_tfidf) - - # run z-score norm - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance")$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_lsi_norm <- mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) - - # Run pca - svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions) - pca_res <- t(svd_attr$u) %*% mat_lsi_norm - return(pca_res) -} - -#' Get most variable features, given a non-log normalized matrix -highly_variable_features <- function(mat, num_feats, n_bins, verbose = FALSE) { - assert_is(mat, "IterableMatrix") - assert_is(num_feats, "integer") - assert_greater_than_zero(num_feats) - assert_len(num_feats, 1) - assert_is(n_bins, "integer") - assert_len(n_bins, 1) - assert_greater_than_zero(n_bins) - if (nrow(mat) <= num_feats) { - if (verbose) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), - returning all features", nrow(mat), num_feats)) - return(mat) - } - # Calculate the mean and variance of each feature - # should we set entries that are 0 to NA? 
- feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats['mean', ] - feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats['variance', ] - feature_dispersion <- feature_vars / feature_means - feature_dispersion <- log(feature_dispersion) - feature_means <- log1p(feature_means) - mean_bins <- cut(feature_means, n_bins, labels = FALSE) - - # Calculate the mean and variance of dispersion of each bin - bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x)) - bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x)) - # Set bin_sd value to bin_mean if bin_sd is NA, results in norm dispersion of 1 - bin_sd[is.na(bin_sd)] <- bin_mean[is.na(bin_sd)] - # map mean_bins indices to bin_stats - feature_dispersion_norm <- (feature_dispersion - bin_mean[mean_bins]) / bin_sd[mean_bins] - names(feature_dispersion_norm) <- names(feature_dispersion) - variable_features_ <- sort(feature_dispersion_norm)[nrow(mat)-num_feats:nrow(mat)] - return(mat[names(variable_features_), ]) } \ No newline at end of file diff --git a/r/man/DimReduction.Rd b/r/man/DimReduction.Rd new file mode 100644 index 00000000..2bb7486c --- /dev/null +++ b/r/man/DimReduction.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{DimReduction} +\alias{DimReduction} +\title{Barebones definition of a DimReduction class.} +\usage{ +DimReduction(x, fitted_params = list(), ...) +} +\description{ +Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. +Child classes should implement a \code{project} method to allow for the projection of other matrices using +the fitted transformation object. 
+}
+\section{Fields}{
+
+\describe{
+\item{\code{cell_embeddings}}{(IterableMatrix, dgCMatrix, matrix) The projected data}
+
+\item{\code{fitted_params}}{(list) A list of parameters used for the transformation of a matrix.}
+}}
+
diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd
index ea2687f9..13348e23 100644
--- a/r/man/lsi.Rd
+++ b/r/man/lsi.Rd
@@ -1,39 +1,41 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/singlecell_utils.R
-\name{lsi}
-\alias{lsi}
+\name{LSI}
+\alias{LSI}
 \title{Perform latent semantic indexing (LSI) on a matrix.}
 \usage{
-lsi(
+LSI(
   mat,
-  z_score_norm = TRUE,
   n_dimensions = 50L,
-  scale_factor = 10000,
-  save_lsi = FALSE,
+  corr_cutoff = 1,
+  normalize = normalize_tfidf,
   threads = 1L
 )
 }
 \arguments{
-\item{mat}{(IterableMatrix) dimensions features x cells}
-
-\item{z_score_norm}{(logical) If \code{TRUE}, z-score normalize the matrix before PCA.}
+\item{mat}{(IterableMatrix) dimensions features x cells.}
 
 \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.}
 
-\item{scale_factor}{(integer) Scale factor for the tf-idf log transform.
-#' @param save_lsi (logical) If \code{TRUE}, save the SVD attributes for the matrix, as well as the idf normalization vector.}
+\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is greater than or equal to
+the corr_cutoff, it will be excluded from the final PCA matrix.}
+
+\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.}
 
 \item{threads}{(integer) Number of threads to use.}
 }
 \value{
+An object of class \code{c("LSI", "DimReduction")} with the following attributes:
 \itemize{
-\item If \code{save_lsi} is \code{FALSE}, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}.
-\item If \code{save_lsi} is \code{TRUE}, return a list with the following elements: +\item \code{cell_embeddings}: The projected data +\item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: \itemize{ -\item \code{pca_res}: dgCMatrix of shape \verb{(n_dimensions, ncol(mat))} -\item \code{svd_attr}: List of SVD attributes -\item \code{idf}: Inverse document frequency vector +\item \code{normalization_method}: The normalization method used +\item \code{feature_means}: The means of the features used for normalization +\item \code{pcs_to_keep}: The PCs that were kept after filtering by correlation to sequencing depth +\item \code{svd_params}: The matrix calculated for SVD } +\item \code{feature_names}: The names of the features in the matrix } } \description{ @@ -47,3 +49,6 @@ Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: \item 17.1 MB memory usage, 25.1 seconds runtime } } +\seealso{ +\code{project()} \code{DimReduction()} \code{normalize_tfidf()} +} diff --git a/r/man/project.Rd b/r/man/project.Rd new file mode 100644 index 00000000..dcab31b5 --- /dev/null +++ b/r/man/project.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{project} +\alias{project} +\title{Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object.} +\usage{ +project(x, mat, ...) +} +\arguments{ +\item{x}{DimReduction object.} + +\item{mat}{IterableMatrix object.} +} +\value{ +IterableMatrix object of the projected data. +} +\description{ +Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. +} +\details{ +DimReduction subclasses should use this to project new data with the same features, to project into the same latent space. 
+All required information to run a projection should be held in x$fitted_params, including pertinent parameters when constructing the DimReduction subclass object.
+If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features.
+If there are rownames, reorder the matrix to match the order of the original matrix.
+}
diff --git a/r/man/highly_variable_features.Rd b/r/man/select_features_by_binned_dispersion.Rd
similarity index 59%
rename from r/man/highly_variable_features.Rd
rename to r/man/select_features_by_binned_dispersion.Rd
index 005a4e37..b3c573ae 100644
--- a/r/man/highly_variable_features.Rd
+++ b/r/man/select_features_by_binned_dispersion.Rd
@@ -1,14 +1,13 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/singlecell_utils.R
-\name{highly_variable_features}
-\alias{highly_variable_features}
+\name{select_features_by_binned_dispersion}
+\alias{select_features_by_binned_dispersion}
 \title{Get the most variable features within a matrix.}
 \usage{
-highly_variable_features(
+select_features_by_binned_dispersion(
   mat,
-  num_feats,
+  num_feats = 25000,
   n_bins = 20,
-  save_feat_selection = FALSE,
   threads = 1L
 )
 }
@@ -22,18 +21,14 @@ all features will be returned.}
 and if the number of features
 within a bin is less than 2, the dispersion is set to 1.}
 
-\item{save_feat_selection}{(logical) If \code{TRUE}, save the dispersions, means, and the features selected.}
-
 \item{threads}{(integer) Number of threads to use.}
 }
 \value{
+Return a dataframe with the following columns, sorted descending by bin-normalized dispersion:
 \itemize{
-\item If \code{save_feat_selection} is \code{TRUE}, return a list with the following elements: -\itemize{ -\item \code{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. -\item \code{feature_selection}: Dataframe with columns \code{name}, \code{mean}, \code{dispersion}, \code{bin}, \code{feature_dispersion_norm}. -} +\item \code{names}: Feature name. +\item \code{score}: Bin-normalized dispersion of the feature. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. } } \description{ diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 3cf6509a..64f1f119 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -135,9 +135,9 @@ reference: - normalize_tfidf - select_features_by_variance - select_features_by_dispersion + - select_features_by_binned_dispersion - select_features_by_mean - - lsi - - highly_variable_features + - LSI - IterableMatrix-methods - pseudobulk_matrix @@ -156,6 +156,11 @@ reference: - knn_to_graph - cluster_membership_matrix +- title: "Dimensionality Reductions" +- contents: + - DimReduction + - project + - title: "Plots" diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index 084645d3..de775f4f 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -36,7 +36,7 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { # Calculate LSI on ArchR # running LSI without binarizing, as we don't do this in the BPCells implementation # we also don't filter quantile outliers. 
- lsi_archr <- .computeLSI( + lsi_archr <- ArchR:::.computeLSI( mat = test_mat, LSIMethod = 2, nDimensions = 20, @@ -53,18 +53,17 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { pre_svd_mat_approx_archr <- lsi_archr$svd$u %*% lsi_mat_archr # Calculate LSI on BPCells # Do not use z-score normalization, as this isn't done with ArchR - lsi_bpcells <- lsi( - test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), - n_dimensions = 20, - save_lsi = TRUE + lsi_bpcells <- LSI( + test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), + n_dimensions = 20 ) - pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res + pre_svd_mat_approx_bpcells <- lsi_bpcells$fitted_params$svd_params$u %*% lsi_bpcells$cell_embeddings testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-4)) # convert signs - lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$pca_res[,1] > 0) - 1), `*`) + lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$cell_embeddings[,1] > 0) - 1), `*`) # Check for post-pca matrix similarity - testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$pca_res, tolerance = 1e-4)) + testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$cell_embeddings, tolerance = 1e-4)) # also check for correlation between the two matrices in PC space - testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$pca_res)) > 0.999) + testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$cell_embeddings)) > 0.999) } test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index c699d67b..93c62a72 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -36,25 +36,6 @@ test_that("select_features works general case", { }) -test_that("select_features works general 
case", { - m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") - for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { - res <- do.call(fn, list(m1, num_feats = 10)) - expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting - expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable - expect_setequal(res$names, rownames(m1)) - res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows - res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) - expect_identical(res_more_feats_than_rows, res_feats_equal_rows) - if (fn != "select_features_by_mean") { - # Check that normalization actually does something - res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) - expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) - } - } -}) - - test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -206,11 +187,16 @@ test_that("Pseudobulk aggregation works with multiple return types", { -test_that("Highly variable feature selection works", { +test_that("Feature selection by bin variance works", { mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to Seurat - res <- highly_variable_features(mat, num_feats = 10, n_bins = 5, threads = 1) - res_t <- highly_variable_features(t(mat), num_feats = 10, n_bins = 5, threads = 1) + res_table <- select_features_by_bin_variance(mat, num_feats = 10, n_bins = 5, threads = 1) + res_table_t <- select_features_by_bin_variance(t(mat), num_feats = 10, n_bins = 5, threads = 1) + browser() + res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + res <- mat[res_feats,] + res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + res_t <- t(mat[,res_feats_t]) expect_equal(nrow(res), 10) expect_equal(ncol(res), 26) expect_equal(nrow(res_t), 10) @@ -222,10 +208,16 @@ test_that("LSI works", { rownames(mat) <- paste0("feat", seq_len(nrow(mat))) colnames(mat) <- paste0("cell", seq_len(ncol(mat))) # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to ArchR - lsi_res <- lsi(mat, n_dimensions = 5) - lsi_res_t <- lsi(t(mat), n_dimensions = 5) + lsi_res_obj <- LSI(mat, n_dimensions = 5) + lsi_res_t_obj <- LSI(t(mat), n_dimensions = 5) + lsi_res <- lsi_res_obj$cell_embeddings + lsi_res_t <- lsi_res_t_obj$cell_embeddings + # Check that projection results in the same output if used on the same input matrix + lsi_res_proj <- project(lsi_res_obj, mat) + expect_equal(nrow(lsi_res), 5) expect_equal(ncol(lsi_res), ncol(mat)) expect_equal(nrow(lsi_res_t), 5) expect_equal(ncol(lsi_res_t), nrow(mat)) + expect_equal(lsi_res, lsi_res_proj) }) \ No newline at end of file From e9c302e5e9b601722659b34b566584e1ae8191e6 Mon Sep 17 00:00:00 2001 From: Immanuel Abdi Date: Thu, 9 Jan 2025 22:25:46 -0800 Subject: [PATCH 023/142] [r] update `NEWS.md` --- r/NEWS.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 24b6b0d9..86d75556 100644 --- a/r/NEWS.md +++ 
b/r/NEWS.md @@ -24,9 +24,8 @@ Contributions welcome :) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) - Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169) -- Add MACS2/3 input creation and peak calling through `call_macs_peaks()` (pull request #118) -- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #156). -- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #156). +- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #181). +- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #181). ## Improvements - `trackplot_loop()` now accepts discrete color scales From 7ed6bd7d59074faaef6016a431acff31e3d5a2ba Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 10 Jan 2025 02:32:12 -0800 Subject: [PATCH 024/142] [r] remove test artifacts --- r/tests/testthat/test-singlecell_utils.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 93c62a72..85416f71 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -192,11 +192,11 @@ test_that("Feature selection by bin variance works", { # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to Seurat res_table <- select_features_by_bin_variance(mat, num_feats = 10, n_bins = 5, threads = 1) res_table_t <- select_features_by_bin_variance(t(mat), num_feats = 10, n_bins = 5, threads = 1) - browser() res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) res <- mat[res_feats,] res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) res_t <- t(mat[,res_feats_t]) + expect_equal(nrow(res), 10) expect_equal(ncol(res), 26) expect_equal(nrow(res_t), 10) @@ -220,4 +220,5 @@ test_that("LSI works", { expect_equal(nrow(lsi_res_t), 5) expect_equal(ncol(lsi_res_t), nrow(mat)) expect_equal(lsi_res, lsi_res_proj) -}) \ No newline at end of file +}) + From 199ae82355991e882de343128abdf62b99bc504b Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Fri, 10 Jan 2025 16:51:00 -0800 Subject: [PATCH 025/142] Update docs --- r/R/transforms.R | 28 ++++++++++++++----------- r/man/normalize.Rd | 45 ++++++++++++++++++++++++++++++++++++++++ r/man/normalize_log.Rd | 23 -------------------- r/man/normalize_tfidf.Rd | 27 ------------------------ r/pkgdown/_pkgdown.yml | 22 +++++++++++--------- 5 files changed, 73 insertions(+), 72 deletions(-) create mode 100644 r/man/normalize.Rd delete mode 100644 r/man/normalize_log.Rd delete mode 100644 r/man/normalize_tfidf.Rd diff --git a/r/R/transforms.R b/r/R/transforms.R index 8a2bd25b..e79c404d 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -928,13 +928,19 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { # Normalizations ################# -#' Normalize a `(features x cells)` matrix using log normalization. -#' @param mat (IterableMatrix) Matrix to normalize. +#' Normalization recipes +#' +#' Apply standard normalizations to a `(features x cells)` counts matrix. +#' +#' @rdname normalize +#' @param mat (IterableMatrix) Counts matrix to normalize. 
`(features x cells)`
 #' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization.
-#' @param threads (integer) Number of threads to use.s
-#' @returns log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells,
-#' the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as:
-#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)}
+#' @param threads (integer) Number of threads to use.
+#' @returns For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells,
+#' transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as:
+#'
+#' - `normalize_log`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)}
+#' @details - `normalize_log`: Corresponds to `Seurat::LogNormalize`
 #' @export
 normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) {
   assert_is(mat, "IterableMatrix")
@@ -946,14 +952,12 @@ normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) {
 }
 
 
-#' Normalize a `(features x cells)` matrix using term frequency-inverse document frequency.
-#' @param feature_means (numeric) Means of the features to normalize by. If no names are provided, then
+#' @rdname normalize
+#' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then
 #' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix.
 #' Else, map each feature name to its mean value.
-#' @returns tf-idf normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells,
-#' the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as:
-#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)}
-#' @inheritParams normalize_log
+#' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)}
+#' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 2019, used by default in `ArchR::addIterativeLSI()` and `Signac::RunTFIDF()`
 #' @export
 normalize_tfidf <- function(
   mat, feature_means = NULL,
diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd
new file mode 100644
index 00000000..f4bde193
--- /dev/null
+++ b/r/man/normalize.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transforms.R
+\name{normalize_log}
+\alias{normalize_log}
+\alias{normalize_tfidf}
+\title{Normalization recipes}
+\usage{
+normalize_log(mat, scale_factor = 10000, threads = 1L)
+
+normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L)
+}
+\arguments{
+\item{mat}{(IterableMatrix) Counts matrix to normalize. \verb{(features x cells)}}
+
+\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.}
+
+\item{threads}{(integer) Number of threads to use.}
+
+\item{feature_means}{(numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then
+each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix.
+Else, map each feature name to its mean value.}
+}
+\value{
+For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells,
+transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as:
+\itemize{
+\item \code{normalize_log}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)}
+}
+
+\itemize{
+\item \code{normalize_tfidf}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)}
+}
+}
+\description{
+Apply standard normalizations to a \verb{(features x cells)} counts matrix.
+}
+\details{
+\itemize{
+\item \code{normalize_log}: Corresponds to \code{Seurat::LogNormalize}
+}
+
+\itemize{
+\item \code{normalize_tfidf}: This follows the formula from Stuart, Butler et al. 2019, used by default in \code{ArchR::addIterativeLSI()} and \code{Signac::RunTFIDF()}
+}
+}
diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd
deleted file mode 100644
index 97d8c92e..00000000
--- a/r/man/normalize_log.Rd
+++ /dev/null
@@ -1,23 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/transforms.R
-\name{normalize_log}
-\alias{normalize_log}
-\title{Normalize a \verb{(features x cells)} matrix using log normalization.}
-\usage{
-normalize_log(mat, scale_factor = 10000, threads = 1L)
-}
-\arguments{
-\item{mat}{(IterableMatrix) Matrix to normalize.}
-
-\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.}
-
-\item{threads}{(integer) Number of threads to use.s}
-}
-\value{
-log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells,
-the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as:
-\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)}
-}
-\description{
-Normalize a \verb{(features x cells)} matrix using log normalization. 
-} diff --git a/r/man/normalize_tfidf.Rd b/r/man/normalize_tfidf.Rd deleted file mode 100644 index 8dc50b84..00000000 --- a/r/man/normalize_tfidf.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/transforms.R -\name{normalize_tfidf} -\alias{normalize_tfidf} -\title{Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency.} -\usage{ -normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) -} -\arguments{ -\item{mat}{(IterableMatrix) Matrix to normalize.} - -\item{feature_means}{(numeric) Means of the features to normalize by. If no names are provided, then -each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. -Else, map each feature name to its mean value.} - -\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} - -\item{threads}{(integer) Number of threads to use.s} -} -\value{ -tf-idf normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, -the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: -\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} -} -\description{ -Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency. 
-} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index ae2772eb..52d03343 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -131,20 +131,14 @@ reference: - checksum - apply_by_row - regress_out - - normalize_log - - normalize_tfidf - IterableMatrix-methods - pseudobulk_matrix -- title: "Reference Annotations" +- title: "Single-cell analysis helpers" +- subtitle: "Dimensionality reduction" - contents: - - human_gene_mapping - - match_gene_symbol - - read_gtf - - read_bed - - read_ucsc_chrom_sizes - -- title: "Clustering" + - normalize_log +- subtitle: "Clustering" - contents: - knn_hnsw - cluster_graph_leiden @@ -182,3 +176,11 @@ reference: - discrete_palette - collect_features - rotate_x_labels + +- title: "Reference Annotations" +- contents: + - human_gene_mapping + - match_gene_symbol + - read_gtf + - read_bed + - read_ucsc_chrom_sizes From 553f262a604e4f366089ff9741e612b8f47821fe Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Fri, 10 Jan 2025 16:51:58 -0800 Subject: [PATCH 026/142] Update NEWS --- r/NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/r/NEWS.md b/r/NEWS.md index 07890a0e..ac2109fb 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -10,6 +10,7 @@ Contributions welcome :) ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. 
(Thanks to @ycli1995 for pull request #166) +- Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168) ## Bug-fixes - Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175) From 7511f0b7e32b7ee7a726ddd73912ac272f3e0d79 Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Fri, 10 Jan 2025 17:00:53 -0800 Subject: [PATCH 027/142] Update docs --- r/R/transforms.R | 2 +- r/man/normalize.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index e79c404d..05759f9e 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -928,7 +928,7 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { # Normalizations ################# -#' Normalization recipes +#' Normalization helper functions #' #' Apply standard normalizations to a `(features x cells)` counts matrix. #' diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd index f4bde193..ac53f2c0 100644 --- a/r/man/normalize.Rd +++ b/r/man/normalize.Rd @@ -3,7 +3,7 @@ \name{normalize_log} \alias{normalize_log} \alias{normalize_tfidf} -\title{Normalization recipes} +\title{Normalization helper functions} \usage{ normalize_log(mat, scale_factor = 10000, threads = 1L) From d67b7db51f7dba0eae8ecb1b02463042780a8542 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 13 Jan 2025 13:38:25 -0800 Subject: [PATCH 028/142] [r] add logging, partial args --- r/R/singlecell_utils.R | 17 ++++++++++++++--- r/tests/testthat/test-singlecell_utils.R | 7 +++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index c6baece6..6ec76e37 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -265,8 +265,16 @@ project.DimReduction <- function(x, mat, ...) 
{ #' @export LSI <- function( mat, n_dimensions = 50L, corr_cutoff = 1, normalize = normalize_tfidf, - threads = 1L + threads = 1L, verbose = FALSE ) { + if (rlang::is_missing(mat)) { + return( + purrr::partial( + LSI, n_dimensions = n_dimensions, corr_cutoff = corr_cutoff, + normalize = normalize, threads = threads, verbose = verbose + ) + ) + } assert_is(mat, "IterableMatrix") assert_is_wholenumber(n_dimensions) assert_len(n_dimensions, 1) @@ -274,8 +282,10 @@ LSI <- function( assert_true(n_dimensions < min(ncol(mat), nrow(mat))) assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) assert_is_wholenumber(threads) - + + if (verbose) log_progress("Starting LSI") # Normalization + if (verbose) log_progress("Normalizing matrix") mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads) read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) if (!is.null(normalize)) mat <- normalize(mat, threads = threads) @@ -286,6 +296,7 @@ LSI <- function( tempfile("mat"), compress = TRUE ) # Run pca + if (verbose) log_progress("Calculating SVD") svd_attr <- svds(mat, k = n_dimensions, threads = threads) pca_res <- t(svd_attr$u) %*% mat @@ -293,7 +304,7 @@ LSI <- function( pca_corrs <- abs(cor(read_depth, t(pca_res))) pca_feats_to_keep <- which(pca_corrs < corr_cutoff) if (length(pca_feats_to_keep) != n_dimensions) { - log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) + if (verbose) log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) pca_res <- pca_res[pca_feats_to_keep, ] } fitted_params <- list( diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 85416f71..034d765a 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -190,8 +190,8 @@ 
test_that("Pseudobulk aggregation works with multiple return types", { test_that("Feature selection by bin variance works", { mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to Seurat - res_table <- select_features_by_bin_variance(mat, num_feats = 10, n_bins = 5, threads = 1) - res_table_t <- select_features_by_bin_variance(t(mat), num_feats = 10, n_bins = 5, threads = 1) + res_table <- select_features_by_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) + res_table_t <- select_features_by_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) res <- mat[res_feats,] res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) @@ -210,6 +210,8 @@ test_that("LSI works", { # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to ArchR lsi_res_obj <- LSI(mat, n_dimensions = 5) lsi_res_t_obj <- LSI(t(mat), n_dimensions = 5) + # Also check partial args + lsi_res_obj_partial <- LSI(n_dimensions = 5)(mat) lsi_res <- lsi_res_obj$cell_embeddings lsi_res_t <- lsi_res_t_obj$cell_embeddings # Check that projection results in the same output if used on the same input matrix @@ -219,6 +221,7 @@ test_that("LSI works", { expect_equal(ncol(lsi_res), ncol(mat)) expect_equal(nrow(lsi_res_t), 5) expect_equal(ncol(lsi_res_t), nrow(mat)) + expect_equal(lsi_res_obj, lsi_res_obj_partial) expect_equal(lsi_res, lsi_res_proj) }) From 435724b4dbccdd9fefa89778a045f58b1b360bd7 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 13 Jan 2025 16:46:14 -0800 Subject: [PATCH 029/142] [r] add partial args to normalizations --- r/R/transforms.R | 17 +++++++++++++++++ r/tests/testthat/test-matrix_transforms.R | 3 +++ 2 files changed, 20 insertions(+) diff --git a/r/R/transforms.R b/r/R/transforms.R index 05759f9e..b097e898 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -943,6 +943,14 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { + if (rlang::is_missing(mat)) { + return( + purrr::partial( + normalize_log, + scale_factor = scale_factor, threads = threads + ) + ) + } assert_is(mat, "IterableMatrix") assert_is_numeric(scale_factor) assert_greater_than_zero(scale_factor) @@ -963,6 +971,15 @@ normalize_tfidf <- function( mat, feature_means = NULL, scale_factor = 1e4, threads = 1L ) { + if (rlang::is_missing(mat)) { + return( + purrr::partial( + normalize_tfidf, + feature_means = feature_means, scale_factor = scale_factor, + threads = threads + ) + ) + } assert_is(mat, "IterableMatrix") assert_is_wholenumber(threads) # If feature means are passed in, only need 
to calculate term frequency diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index 67641e54..24cd9c23 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -367,6 +367,8 @@ test_that("tf-idf normalization works", { res <- normalize_tfidf(m2) expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6) res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) + res_with_row_means_partial <- normalize_tfidf(feature_means = row_means)(m2) + expect_equal(res_with_row_means, res_with_row_means_partial) expect_identical(res, res_with_row_means) res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled) @@ -386,5 +388,6 @@ test_that("normalize_log works", { # Test that changing scale factor works res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") + res_2_partial <- as(normalize_log(scale_factor = 1e5)(m2), "dgCMatrix") expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6) }) \ No newline at end of file From 8dbe8e52be1d5e79aa895deeb095fd80314c980c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 13 Jan 2025 19:18:45 -0800 Subject: [PATCH 030/142] [r] create mechanism for partial calls on explicit args --- r/R/transforms.R | 13 ++++++------- r/R/utils.R | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index b097e898..02ab51b8 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -943,11 +943,11 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { + # browser() if (rlang::is_missing(mat)) { return( - purrr::partial( - normalize_log, - scale_factor = scale_factor, threads = threads + partial_explicit( + normalize_log, scale_factor = scale_factor, threads = threads ) 
) } @@ -973,10 +973,9 @@ normalize_tfidf <- function( ) { if (rlang::is_missing(mat)) { return( - purrr::partial( - normalize_tfidf, - feature_means = feature_means, scale_factor = scale_factor, - threads = threads + partial_explicit( + normalize_tfidf, feature_means = feature_means, + scale_factor = scale_factor, threads = threads ) ) } diff --git a/r/R/utils.R b/r/R/utils.R index 4ea62d15..784f6106 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -56,4 +56,20 @@ log_progress <- function(msg, add_timestamp = TRUE){ } else { message(msg) } +} + +# Helper function to create partial explicit functions +# This builds upon purrr::partial by allowing for nested partial calls, where each partial call +# only does partial application of the arguments that were explicitly provided. +partial_explicit <- function(fn, ...) { + args <- rlang::enquos(...) + evaluated_args <- purrr::map(args, rlang::eval_tidy) + # Fetch the default arguments from the function definition + default_args <- formals(fn) + # Keep only explicitly provided arguments that were evaluated + # where the values are different from the default arguments + explicitly_passed_args <- evaluated_args[names(evaluated_args) %in% names(default_args) & + !purrr::map2_lgl(evaluated_args, default_args[names(evaluated_args)], identical)] + # Return a partially applied version of the function using evaluated arguments + return(purrr::partial(fn, !!!explicitly_passed_args)) } \ No newline at end of file From 067b5406f7f8fb26bfa98fd00470ef0fedf262fc Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 13 Jan 2025 21:19:46 -0800 Subject: [PATCH 031/142] [r] add partial calls, update feature selection docs --- r/R/singlecell_utils.R | 49 ++++++++------- r/man/feature_selection.Rd | 78 ++++++++++++++++++++++++ r/man/select_features_by_dispersion.Rd | 42 ------------- r/man/select_features_by_mean.Rd | 34 ----------- r/man/select_features_by_variance.Rd | 41 ------------- r/pkgdown/_pkgdown.yml | 4 +- 
r/tests/testthat/test-singlecell_utils.R | 8 ++- 7 files changed, 112 insertions(+), 144 deletions(-) create mode 100644 r/man/feature_selection.Rd delete mode 100644 r/man/select_features_by_dispersion.Rd delete mode 100644 r/man/select_features_by_mean.Rd delete mode 100644 r/man/select_features_by_variance.Rd diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 9a8993f7..40babbe3 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -11,34 +11,42 @@ # Feature selection ################# -#' Get the most variable features within a matrix. +#' Feature selection functions +#' +#' Apply a feature selection method to a `(features x cells)` matrix. +#' @rdname feature_selection #' @param mat (IterableMatrix) dimensions features x cells #' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, #' all features will be returned. #' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed. #' @param threads (integer) Number of threads to use. #' @returns -#' Return a dataframe with the following columns, sorted descending by variance: +#' Return a dataframe with the following columns, sorted descending by score: #' - `names`: Feature name. -#' - `score`: Variance of the feature. +#' - `score`: Scoring of the feature, depending on the method used. #' - `highly_variable`: Logical vector of whether the feature is highly variable. +#' +#' Each different feature selection method will have a different scoring method: +#' - `select_features_by_variance`: Score representing variance of each feature. #' @details -#' Calculate using the following process: +#' `select_features_by_variance` Calculates the variance of each feature using the following process: #' 1. Perform an optional term frequency + log normalization, for each feature. -#' 2. Find `num_feats` features with the highest variance across clusters. +#' 2. 
Find `num_feats` features with the highest variance. #' @export select_features_by_variance <- function( mat, num_feats = 25000, normalize = normalize_log, threads = 1L ) { + if (rlang::is_missing(mat)) { + return(purrr::partial(select_features_by_variance, num_feats = num_feats, normalize = normalize, threads = threads)) + } assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) assert_is_wholenumber(num_feats) assert_len(num_feats, 1) assert_is(num_feats, "numeric") num_feats <- min(max(num_feats, 0), nrow(mat)) - if (!is.null(normalize)) mat <- normalize(mat, threads = threads) features_df <- tibble::tibble( names = rownames(mat), @@ -50,15 +58,11 @@ select_features_by_variance <- function( } -#' Get the features with the highest dispersion within a matrix. +#' @rdname feature_selection #' @returns -#' Return a dataframe with the following columns, sorted descending by dispersion: -#' - `names`: Feature name. -#' - `score`: Variance of the feature. -#' - `highly_variable`: Logical vector of whether the feature is highly variable. -#' @inheritParams select_features_by_variance +#' - `select_features_by_dispersion`: Score representing the dispersion of each feature. #' @details -#' Calculate using the following process: +#' `select_features_by_dispersion` calculates the dispersion of each feature using the following process: #' 1. Perform an optional term frequency + log normalization, for each feature. #' 2. Find the dispersion (variance/mean) of each feature. #' 3. Find `num_feats` features with the highest dispersion. 
@@ -67,6 +71,9 @@ select_features_by_dispersion <- function( normalize = NULL, threads = 1L ) { + if (rlang::is_missing(mat)) { + return(partial_explicit(select_features_by_dispersion, num_feats = num_feats, normalize = normalize, threads = threads)) + } assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) assert_is_wholenumber(num_feats) @@ -86,21 +93,18 @@ select_features_by_dispersion <- function( } -#' Get the top features from a matrix, based on the mean accessibility of each feature. -#' @param num_feats Number of features to deem as highly accessible. If the number is higher than the number of features in the matrix, -#' all features will be returned. -#' @inheritParams select_features_by_variance +#' @rdname feature_selection #' @returns -#' Return a dataframe with the following columns, sorted descending by mean accessibility: -#' - `names`: Feature name. -#' - `score`: Binarize sum of each feature. -#' - `highly_variable`: Logical vector of whether the feature is highly accessible by mean accessibility. +#' - `select_features_by_mean`: Score representing the mean accessibility of each feature. #' @details -#' Calculate using the following process: +#' `select_features_by_mean` calculates the mean accessibility of each feature using the following process: #' 1. Get the sum of each binarized feature. #' 2. Find `num_feats` features with the highest accessibility. #' @export select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) { + if (rlang::is_missing(mat)) { + return(partial_explicit(select_features_by_mean, num_feats = num_feats, threads = threads)) + } assert_is(mat, "IterableMatrix") assert_is_wholenumber(num_feats) assert_greater_than_zero(num_feats) @@ -182,7 +186,6 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } - #' Aggregate counts matrices by cell group or feature. 
#' #' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd new file mode 100644 index 00000000..c57a8e40 --- /dev/null +++ b/r/man/feature_selection.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_by_variance} +\alias{select_features_by_variance} +\alias{select_features_by_dispersion} +\alias{select_features_by_mean} +\title{Feature selection functions} +\usage{ +select_features_by_variance( + mat, + num_feats = 25000, + normalize = normalize_log, + threads = 1L +) + +select_features_by_dispersion( + mat, + num_feats = 25000, + normalize = NULL, + threads = 1L +) + +select_features_by_mean(mat, num_feats = 25000, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +Return a dataframe with the following columns, sorted descending by score: +\itemize{ +\item \code{names}: Feature name. +\item \code{score}: Scoring of the feature, depending on the method used. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. +} + +Each different feature selection method will have a different scoring method: +\itemize{ +\item \code{select_features_by_variance}: Score representing variance of each feature. +} + +\itemize{ +\item \code{select_features_by_dispersion}: Score representing the dispersion of each feature. +} + +\itemize{ +\item \code{select_features_by_mean}: Score representing the mean accessibility of each feature. 
+} +} +\description{ +Apply a feature selection method to a \verb{(features x cells)} matrix. +} +\details{ +\code{select_features_by_variance} Calculates the variance of each feature using the following process: +\enumerate{ +\item Perform an optional term frequency + log normalization, for each feature. +\item Find \code{num_feats} features with the highest variance. +} + +\code{select_features_by_dispersion} calculates the dispersion of each feature using the following process: +\enumerate{ +\item Perform an optional term frequency + log normalization, for each feature. +\item Find the dispersion (variance/mean) of each feature. +\item Find \code{num_feats} features with the highest dispersion. +} + +\code{select_features_by_mean} calculates the mean accessibility of each feature using the following process: +\enumerate{ +\item Get the sum of each binarized feature. +\item Find \code{num_feats} features with the highest accessibility. +} +} diff --git a/r/man/select_features_by_dispersion.Rd b/r/man/select_features_by_dispersion.Rd deleted file mode 100644 index 2835c9a8..00000000 --- a/r/man/select_features_by_dispersion.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/singlecell_utils.R -\name{select_features_by_dispersion} -\alias{select_features_by_dispersion} -\title{Get the features with the highest dispersion within a matrix.} -\usage{ -select_features_by_dispersion( - mat, - num_feats = 25000, - normalize = NULL, - threads = 1L -) -} -\arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} - -\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, -all features will be returned.} - -\item{normalize}{(function) Normalize matrix using a given function. 
If \code{NULL}, no normalization is performed.} - -\item{threads}{(integer) Number of threads to use.} -} -\value{ -Return a dataframe with the following columns, sorted descending by dispersion: -\itemize{ -\item \code{names}: Feature name. -\item \code{score}: Variance of the feature. -\item \code{highly_variable}: Logical vector of whether the feature is highly variable. -} -} -\description{ -Get the features with the highest dispersion within a matrix. -} -\details{ -Calculate using the following process: -\enumerate{ -\item Perform an optional term frequency + log normalization, for each feature. -\item Find the dispersion (variance/mean) of each feature. -\item Find \code{num_feats} features with the highest dispersion. -} -} diff --git a/r/man/select_features_by_mean.Rd b/r/man/select_features_by_mean.Rd deleted file mode 100644 index c05b0acb..00000000 --- a/r/man/select_features_by_mean.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/singlecell_utils.R -\name{select_features_by_mean} -\alias{select_features_by_mean} -\title{Get the top features from a matrix, based on the mean accessibility of each feature.} -\usage{ -select_features_by_mean(mat, num_feats = 25000, threads = 1L) -} -\arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} - -\item{num_feats}{Number of features to deem as highly accessible. If the number is higher than the number of features in the matrix, -all features will be returned.} - -\item{threads}{(integer) Number of threads to use.} -} -\value{ -Return a dataframe with the following columns, sorted descending by mean accessibility: -\itemize{ -\item \code{names}: Feature name. -\item \code{score}: Binarize sum of each feature. -\item \code{highly_variable}: Logical vector of whether the feature is highly accessible by mean accessibility. -} -} -\description{ -Get the top features from a matrix, based on the mean accessibility of each feature. 
-} -\details{ -Calculate using the following process: -\enumerate{ -\item Get the sum of each binarized feature. -\item Find \code{num_feats} features with the highest accessibility. -} -} diff --git a/r/man/select_features_by_variance.Rd b/r/man/select_features_by_variance.Rd deleted file mode 100644 index b7cc375f..00000000 --- a/r/man/select_features_by_variance.Rd +++ /dev/null @@ -1,41 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/singlecell_utils.R -\name{select_features_by_variance} -\alias{select_features_by_variance} -\title{Get the most variable features within a matrix.} -\usage{ -select_features_by_variance( - mat, - num_feats = 25000, - normalize = normalize_log, - threads = 1L -) -} -\arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} - -\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, -all features will be returned.} - -\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} - -\item{threads}{(integer) Number of threads to use.} -} -\value{ -Return a dataframe with the following columns, sorted descending by variance: -\itemize{ -\item \code{names}: Feature name. -\item \code{score}: Variance of the feature. -\item \code{highly_variable}: Logical vector of whether the feature is highly variable. -} -} -\description{ -Get the most variable features within a matrix. -} -\details{ -Calculate using the following process: -\enumerate{ -\item Perform an optional term frequency + log normalization, for each feature. -\item Find \code{num_feats} features with the highest variance across clusters. 
-} -} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 70b77d96..0bf475aa 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -132,9 +132,6 @@ reference: - checksum - apply_by_row - regress_out - - select_features_by_variance - - select_features_by_dispersion - - select_features_by_mean - IterableMatrix-methods - pseudobulk_matrix @@ -142,6 +139,7 @@ reference: - subtitle: "Dimensionality reduction" - contents: - normalize_log + - select_features_by_variance - subtitle: "Clustering" - contents: - knn_hnsw diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 2b843945..efefc070 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -21,11 +21,17 @@ test_that("select_features works general case", { expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable expect_setequal(res$names, rownames(m1)) res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows - res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) + res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) + res_feats_partial <- get(fn)(num_feats = 100)(m1) + expect_identical(res_feats_equal_rows, res_feats_partial) expect_identical(res_more_feats_than_rows, res_feats_equal_rows) if (fn == "select_features_by_variance") { # Check that normalization actually does something res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) + # Check that we can do partial functions on normalization too + res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize = purrr::partial(normalize_log(scale = 1e3, threads = 1L)))) + res_norm_implicit_partial <- select_features_by_variance(normalize = normalize_log(scale_factor = 1e3), num_feats = 10)(m1) + expect_identical(res_norm_partial, res_norm_implicit_partial) expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% 
dplyr::arrange(names))$score)) } } From 5e49504eae4d555d830cbc7f43713e29c8cfde77 Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Fri, 10 Jan 2025 16:51:00 -0800 Subject: [PATCH 032/142] Update docs --- r/R/transforms.R | 28 ++++++++++++++----------- r/man/normalize.Rd | 45 ++++++++++++++++++++++++++++++++++++++++ r/man/normalize_log.Rd | 23 -------------------- r/man/normalize_tfidf.Rd | 27 ------------------------ r/pkgdown/_pkgdown.yml | 20 +++++++++++------- 5 files changed, 73 insertions(+), 70 deletions(-) create mode 100644 r/man/normalize.Rd delete mode 100644 r/man/normalize_log.Rd delete mode 100644 r/man/normalize_tfidf.Rd diff --git a/r/R/transforms.R b/r/R/transforms.R index 8a2bd25b..e79c404d 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -928,13 +928,19 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { # Normalizations ################# -#' Normalize a `(features x cells)` matrix using log normalization. -#' @param mat (IterableMatrix) Matrix to normalize. +#' Normalization recipes +#' +#' Apply standard normalizations to a `(features x cells)` counts matrix. +#' +#' @rdname normalize +#' @param mat (IterableMatrix) Counts matrix to normalize. `(features x cells)` #' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization. -#' @param threads (integer) Number of threads to use.s -#' @returns log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, -#' the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: -#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +#' @param threads (integer) Number of threads to use. 
+#' @returns For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +#' transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: +#' +#' - `normalize_log`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +#' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_is(mat, "IterableMatrix") @@ -946,14 +952,12 @@ normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { } -#' Normalize a `(features x cells)` matrix using term frequency-inverse document frequency. -#' @param feature_means (numeric) Means of the features to normalize by. If no names are provided, then +#' @rdname normalize +#' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then #' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. #' Else, map each feature name to its mean value. -#' @returns tf-idf normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, -#' the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: -#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} -#' @inheritParams normalize_log +#' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +#' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 
2019, used by default in `ArchR::addIterativeLSI()` and `Signac::RunTFIDF()` #' @export normalize_tfidf <- function( mat, feature_means = NULL, diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd new file mode 100644 index 00000000..f4bde193 --- /dev/null +++ b/r/man/normalize.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transforms.R +\name{normalize_log} +\alias{normalize_log} +\alias{normalize_tfidf} +\title{Normalization recipes} +\usage{ +normalize_log(mat, scale_factor = 10000, threads = 1L) + +normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) Counts matrix to normalize. \verb{(features x cells)}} + +\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} + +\item{threads}{(integer) Number of threads to use.} + +\item{feature_means}{(numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then +each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. +Else, map each feature name to its mean value.} +} +\value{ +For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: +\itemize{ +\item \code{normalize_log}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +} + +\itemize{ +\item \code{normalize_tfidf}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +} +} +\description{ +Apply standard normalizations to a \verb{(features x cells)} counts matrix. +} +\details{ +\itemize{ +\item \code{normalize_log}: Corresponds to \code{Seurat::NormalizeLog} +} + +\itemize{ +\item \code{normalize_tfidf}: This follows the formula from Stuart, Butler et al. 
2019, used by default in \code{ArchR::addIterativeLSI()} and \code{Signac::RunTFIDF()} +} +} diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd deleted file mode 100644 index 97d8c92e..00000000 --- a/r/man/normalize_log.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/transforms.R -\name{normalize_log} -\alias{normalize_log} -\title{Normalize a \verb{(features x cells)} matrix using log normalization.} -\usage{ -normalize_log(mat, scale_factor = 10000, threads = 1L) -} -\arguments{ -\item{mat}{(IterableMatrix) Matrix to normalize.} - -\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} - -\item{threads}{(integer) Number of threads to use.s} -} -\value{ -log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, -the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: -\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} -} -\description{ -Normalize a \verb{(features x cells)} matrix using log normalization. -} diff --git a/r/man/normalize_tfidf.Rd b/r/man/normalize_tfidf.Rd deleted file mode 100644 index 8dc50b84..00000000 --- a/r/man/normalize_tfidf.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/transforms.R -\name{normalize_tfidf} -\alias{normalize_tfidf} -\title{Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency.} -\usage{ -normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) -} -\arguments{ -\item{mat}{(IterableMatrix) Matrix to normalize.} - -\item{feature_means}{(numeric) Means of the features to normalize by. If no names are provided, then -each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. 
-Else, map each feature name to its mean value.} - -\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} - -\item{threads}{(integer) Number of threads to use.s} -} -\value{ -tf-idf normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, -the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: -\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} -} -\description{ -Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency. -} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 64f1f119..471e06f3 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -141,15 +141,11 @@ reference: - IterableMatrix-methods - pseudobulk_matrix -- title: "Reference Annotations" +- title: "Single-cell analysis helpers" +- subtitle: "Dimensionality reduction" - contents: - - human_gene_mapping - - match_gene_symbol - - read_gtf - - read_bed - - read_ucsc_chrom_sizes - -- title: "Clustering" + - normalize_log +- subtitle: "Clustering" - contents: - knn_hnsw - cluster_graph_leiden @@ -192,3 +188,11 @@ reference: - discrete_palette - collect_features - rotate_x_labels + +- title: "Reference Annotations" +- contents: + - human_gene_mapping + - match_gene_symbol + - read_gtf + - read_bed + - read_ucsc_chrom_sizes From 453215f607cc8fcc41235814abd71b8a6c0a8d9e Mon Sep 17 00:00:00 2001 From: Immanuel Abdi <56730419+immanuelazn@users.noreply.github.com> Date: Mon, 16 Dec 2024 19:55:38 -0800 Subject: [PATCH 033/142] [ci] add update to apt-get (#164) --- .github/workflows/r-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/r-test.yml b/.github/workflows/r-test.yml index 3a41fe65..aadedf6d 100644 --- a/.github/workflows/r-test.yml +++ b/.github/workflows/r-test.yml @@ -17,7 +17,7 @@ jobs: - uses: actions/checkout@v4 - name: 
Install binary dependencies on ubuntu if: ${{ matrix.os == 'ubuntu-latest' }} - run: sudo apt-get install -y libhdf5-dev + run: sudo apt-get update && sudo apt-get install -y libhdf5-dev - name: Install binary dependencies on mac if: ${{ matrix.os == 'macos-latest' || matrix.os == 'macos-13' }} run: brew install hdf5 From 3c99a0172cad79ab671b2c08a0115f5a489831eb Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Mon, 16 Dec 2024 20:02:19 -0800 Subject: [PATCH 034/142] [r] Fix articles index to include manuscript draft (#170) --- r/pkgdown/_pkgdown.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 471e06f3..30cf2eeb 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -59,6 +59,7 @@ articles: - "web-only/programming-efficiency" - "web-only/programming-philosophy" - "web-only/developer-notes" + - "web-only/manuscript-draft" # cosmo, flatly, united, sandstone all look reasonable # pulse, lumen, zephyr From 5a8a3354e61ab9c85d103991667424376451852c Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Sat, 21 Dec 2024 15:56:14 -0800 Subject: [PATCH 035/142] [r] Fix type confusion in `pseudobulk_matrix()` and clean up `parallel_split()` (#174) --- r/NEWS.md | 1 + r/R/matrix.R | 28 ++++++++++-------------- r/R/singlecell_utils.R | 15 ++++++++----- r/tests/testthat/test-singlecell_utils.R | 3 ++- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 86d75556..26129125 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -45,6 +45,7 @@ Contributions welcome :) - Fixed discrepancy between default ArchR and BPCells peak calling insertion method, where BPCells defaulted to only using the start of each fragment as opposed to ArchR's method of using both start and end sites of fragments (pull request #143) - Fix error in `tile_matrix()` with fragment mode (pull request #141) - Fix precision bug in `sctransform_pearson()` on ARM architecture (pull request #141) +- Fix type-confusion error 
when `pseudobulk_matrix()` gets an integer matrix (pull request #174) ## Deprecations - `trackplot_coverage()` `legend_label` argument is now ignored, as the color legend is no longer shown by default for coverage plots. diff --git a/r/R/matrix.R b/r/R/matrix.R index 66e391c1..011712e8 100644 --- a/r/R/matrix.R +++ b/r/R/matrix.R @@ -1378,6 +1378,10 @@ parallel_split <- function(mat, threads, chunks=threads) { assert_is_wholenumber(chunks) assert_true(chunks >= threads) + if (threads <= 1L) { + return(mat) + } + if (mat@transpose) { return(t(parallel_split(t(mat), threads, chunks))) } @@ -2783,14 +2787,10 @@ matrix_stats <- function(matrix, row_stats_number <- match(row_stats, stat_options) - 1 col_stats_number <- match(col_stats, stat_options) - 1 - matrix <- convert_matrix_type(matrix, "double") - if (threads == 0L) { - it <- iterate_matrix(matrix) - } else { - it <- iterate_matrix( - parallel_split(matrix, threads, threads*4) - ) - } + it <- matrix %>% + convert_matrix_type("double") %>% + parallel_split(threads, threads*4) %>% + iterate_matrix() res <- matrix_stats_cpp(it, row_stats_number, col_stats_number) rownames(res$row_stats) <- stat_options[seq_len(row_stats_number) + 1] rownames(res$col_stats) <- stat_options[seq_len(col_stats_number) + 1] @@ -2847,14 +2847,10 @@ svds.IterableMatrix <- function(A, k, nu = k, nv = k, opts = list(), threads=0, ) solver_params[names(opts)] <- opts - A <- convert_matrix_type(A, "double") - if (threads == 0L) { - it <- iterate_matrix(A) - } else { - it <- iterate_matrix( - parallel_split(A, threads, threads*4) - ) - } + it <- A %>% + convert_matrix_type("double") %>% + parallel_split(threads, threads*4) %>% + iterate_matrix() svds_cpp( it, diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 6ec76e37..7db9fd2f 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -439,7 +439,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' extra calculation time, and when calculating `mean`, 
adding `nonzeros` will take no extra time. #' @inheritParams marker_features #' @export -pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { +pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L) { assert_is(mat, "IterableMatrix") assert_is(cell_groups, c("factor", "character", "numeric")) assert_true(length(cell_groups) == ncol(mat)) @@ -452,9 +452,13 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { } } assert_is(threads, "integer") - # if multiple methods are provided, only need to pass in the top method as it will also calculate the less complex stats - iter <- iterate_matrix(parallel_split(mat, threads, threads*4)) - res <- pseudobulk_matrix_cpp(iter, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) + + it <- mat %>% + convert_matrix_type("double") %>% + parallel_split(threads, threads*4) %>% + iterate_matrix() + + res <- pseudobulk_matrix_cpp(it, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) # if res is a single matrix, return with colnames and rownames if (length(method) == 1) { colnames(res[[method]]) <- levels(cell_groups) @@ -463,7 +467,8 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { } # give colnames and rownames for each matrix in res, which is a named list for (res_slot in names(res)) { - if ((length(res[[res_slot]]) == 0) || !(res_slot %in% method)) { + # Filter out methods that weren't requested + if (!(res_slot %in% method)) { res[[res_slot]] <- NULL } else { colnames(res[[res_slot]]) <- levels(cell_groups) diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 034d765a..bc5a0e03 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -120,7 +120,8 @@ test_that("Pseudobulk aggregation works", { m0 <- as.matrix(m0) m0_t <- t(m0) m1_t <- t(m1) - for (matrices 
in list(list(m0_t, m1_t), list(m0, m1))) { + m1_int <- convert_matrix_type(m1, "uint32_t") + for (matrices in list(list(m0_t, m1_t), list(m0, m1), list(m0, m1_int))) { # Check across two equal groups, one group, and groups of equal length m <- matrices[[1]] m_bpcells <- matrices[[2]] From 8e0603bc1aa4f4a631d0c7ae4a90ab3ee64929d3 Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Sat, 21 Dec 2024 23:07:02 -0800 Subject: [PATCH 036/142] [r] 0.3.0 release announcement (#177) --- r/DESCRIPTION | 3 ++- r/NEWS.md | 15 ++++++++++++--- r/R/atac_utils.R | 2 +- r/R/trackplots.R | 4 ++-- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 14dd90c9..17faadb9 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,12 +1,13 @@ Package: BPCells Title: Single Cell Counts Matrices to PCA -Version: 0.2.0 +Version: 0.3.0 Authors@R: c( person(given = "Benjamin", family = "Parks", role = c("aut", "cre", "cph"), email = "bparks@alumni.stanford.edu", comment = c(ORCID = "0000-0002-0261-7472")), + person("Immanuel", "Abdi", role = "aut"), person("Stanford University", role=c("cph", "fnd")), person("Genentech, Inc.", role=c("cph", "fnd"))) Description: > Efficient operations for single cell ATAC-seq fragments and diff --git a/r/NEWS.md b/r/NEWS.md index 26129125..c9fa9980 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -1,12 +1,21 @@ # BPCells 1.0 Roadmap - ~~Parallelization~~ (basic support complete. See below) -- Native python library (re-using C++ backend) +- Native python library (re-using C++ backend; basic support started) - Peak-gene correlations -- MACS peak calling +- ~~MACS peak calling~~ (basic support complete. See below) Contributions welcome :) -# BPCells 0.2.1 (main branch - in progress) +# BPCells 0.3.0 (12/21/2024) + +The BPCells 0.3.0 release covers 6 months of changes and 45 commits from 5 contributors. 
Notable improvements +this release include support for peak calling with MACS and the addition of pseudobulk matrix and stats calculations. +We also released an initial prototype of a BPCells Python library (more details [here](https://bnprks.github.io/BPCells/python/index.html)). +Full details of changes below. + +Thanks to @ycli1995, @Yunuuuu, and @douglasgscofield for pull requests that contributed to this release, as well as to users who +sumitted github issues to help identify and fix bugs. We also added @immanuelazn to the team as a new hire! He is responsible for many +of the new features this release and will continue to help with maintenance and new development moving forwards. ## Features - `apply_by_col()` and `apply_by_row()` allow providing custom R functions to compute per row/col summaries. diff --git a/r/R/atac_utils.R b/r/R/atac_utils.R index 9bd797db..b0b6e900 100644 --- a/r/R/atac_utils.R +++ b/r/R/atac_utils.R @@ -625,7 +625,7 @@ call_peaks_macs <- function(fragments, path, #' @export #' @keywords internal call_macs_peaks <- function(...) 
{ - lifecycle::deprecate_warn("0.2.0", "call_macs_peaks()", "call_peaks_macs()") + lifecycle::deprecate_warn("0.3.0", "call_macs_peaks()", "call_peaks_macs()") return(call_peaks_macs(...)) } diff --git a/r/R/trackplots.R b/r/R/trackplots.R index 357cd3d1..998bfecd 100644 --- a/r/R/trackplots.R +++ b/r/R/trackplots.R @@ -428,7 +428,7 @@ trackplot_coverage <- function(fragments, region, groups, assert_is(colors, "character") assert_true(length(colors) >= length(unique(groups))) if (!is.null(legend_label)) { - lifecycle::deprecate_warn("0.2.0", "trackplot_coverage(legend_label)", details="Argument value is no longer used since color legend is not shown.") + lifecycle::deprecate_warn("0.3.0", "trackplot_coverage(legend_label)", details="Argument value is no longer used since color legend is not shown.") } groups <- as.factor(groups) @@ -1033,7 +1033,7 @@ trackplot_bulk <- function(fragments, region, groups, legend_label = "group", zero_based_coords = !is(region, "GRanges"), return_data = FALSE, return_plot_list = FALSE, apply_styling = TRUE) { - lifecycle::deprecate_warn("0.2.0", "trackplot_bulk()", "trackplot_combine()") + lifecycle::deprecate_warn("0.2.0", "trackplot_bulk()", "trackplot_coverage()") assert_is(fragments, "IterableFragments") assert_not_null(cellNames(fragments)) From 4391202c31c1f61e300463e2924e6e1dfeefc0b0 Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Wed, 8 Jan 2025 17:12:43 -0800 Subject: [PATCH 037/142] [r] Improve error printing in `call_peaks_macs` (#175) --- r/NEWS.md | 5 ++++ r/R/atac_utils.R | 10 ++++++-- r/tests/testthat/test-atac_utils.R | 39 ++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index c9fa9980..28a159c6 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -6,6 +6,11 @@ Contributions welcome :) +# BPCells 0.3.1 (in-progress main branch) + +## Bug-fixes +- Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175) + # BPCells 0.3.0 
(12/21/2024) The BPCells 0.3.0 release covers 6 months of changes and 45 commits from 5 contributors. Notable improvements diff --git a/r/R/atac_utils.R b/r/R/atac_utils.R index b0b6e900..ce7aa930 100644 --- a/r/R/atac_utils.R +++ b/r/R/atac_utils.R @@ -576,7 +576,7 @@ call_peaks_macs <- function(fragments, path, if (length(file_names) != length(levels(cell_groups))) { warning("Number of shell files does not match number of clusters") } - parallel::mclapply(file_names, function(shell_file) { + macs_success <- parallel::mclapply(file_names, function(shell_file) { cluster <- gsub(".sh$", "", shell_file) if (verbose) log_progress(paste0("Running MACS for cluster: ", cluster)) dir.create(file.path(path, "output", cluster), showWarnings = FALSE, recursive = TRUE) @@ -584,13 +584,19 @@ call_peaks_macs <- function(fragments, path, macs_message <- system2("bash", sprintf("'%s'", file.path(path, "input", shell_file)),stdout = log_file, stderr = log_file, env = c("OMP_NUM_THREADS=1")) # Try detecting if macs failed before writing that cluster is finished if (macs_message != 0) { - stop(paste0(format(Sys.time(), "%Y-%m-%d %H:%M:%S"), " Error running MACS for cluster: ", cluster, "\n", + log_progress(paste0(" Error running MACS for cluster: ", cluster, "\n", "MACS log file written to: ", log_file)) + return(FALSE) } else if (verbose) { log_progress(paste0(" Finished running MACS for cluster: ", cluster)) log_progress(paste0(" MACS log file written to: ", log_file)) } + return(TRUE) }, mc.cores = threads, mc.preschedule = FALSE) + failures <- sum(!unlist(macs_success)) + if (failures > 0) { + rlang::abort(c(sprintf("MACS calls encountered %d failures", failures), "See error logs listed above")) + } } # Read outputs if (step %in% c("read-outputs", "all")) { diff --git a/r/tests/testthat/test-atac_utils.R b/r/tests/testthat/test-atac_utils.R index d2a51167..769f2b05 100644 --- a/r/tests/testthat/test-atac_utils.R +++ b/r/tests/testthat/test-atac_utils.R @@ -333,4 +333,43 @@ 
test_that("macs_e2e_works", { ) # Make sure the outputs are the same expect_equal(macs_read, macs_read_full_pipeline) +}) + +test_that("macs errors print when running in parallel", { + # The dummy macs script only works on a unix setup, and windows currently doesn't support + # multi-threading anyhow + skip_on_os("windows") + dir <- withr::local_tempdir() + + # Make a dummy macs script that will register as valid but won't actually run + bad_macs_path <- file.path(dir, "bad_macs.sh") + writeLines(c( + "#!/bin/bash", + "if [[ $1 == '--version' ]]; then", + " echo 'Bad macs demo'", + "else", + " exit 1", + "fi" + ), bad_macs_path) + Sys.chmod(bad_macs_path, "0700") + + frags <- tibble::tibble(chr="chr1", start=1:10, end=start+5, cell_id=rep(c("a","b"), 5)) %>% convert_to_fragments() + call_peaks_macs( + frags, + path=file.path(dir, "macs-test"), + cell_groups=c(a="a_group", b="b_group"), + step="prep-inputs", + macs_executable=bad_macs_path, + threads=2 + ) + expect_error({ + call_peaks_macs( + frags, + path=file.path(dir, "macs-test"), + cell_groups=c(a="a_group", b="b_group"), + step="run-macs", + macs_executable=bad_macs_path, + threads=2 + ) + }, "MACS calls encountered .* failures") }) \ No newline at end of file From f1232b0a6efddeb3b79ce5d3def3b384e2ccd7d9 Mon Sep 17 00:00:00 2001 From: Yuchen Li <105353386+ycli1995@users.noreply.github.com> Date: Thu, 9 Jan 2025 15:33:37 +0800 Subject: [PATCH 038/142] [r][cpp] Support writing AnnData dense matrices (#166) --- r/NAMESPACE | 1 + r/NEWS.md | 3 + r/R/RcppExports.R | 4 + r/R/matrix.R | 44 +++++- r/man/open_matrix_anndata_hdf5.Rd | 31 +++- r/src/Makevars.in | 1 + r/src/RcppExports.cpp | 17 ++ .../matrixIterators/H5DenseMatrixWriter.cpp | 149 ++++++++++++++++++ .../matrixIterators/H5DenseMatrixWriter.h | 35 ++++ r/src/matrix_io.cpp | 45 ++++++ r/tests/testthat/test-matrix_io.R | 65 ++++++++ 11 files changed, 383 insertions(+), 12 deletions(-) create mode 100644 
r/src/bpcells-cpp/matrixIterators/H5DenseMatrixWriter.cpp create mode 100644 r/src/bpcells-cpp/matrixIterators/H5DenseMatrixWriter.h diff --git a/r/NAMESPACE b/r/NAMESPACE index b296a27f..4892b6d5 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -142,6 +142,7 @@ export(write_fragments_memory) export(write_insertion_bedgraph) export(write_matrix_10x_hdf5) export(write_matrix_anndata_hdf5) +export(write_matrix_anndata_hdf5_dense) export(write_matrix_dir) export(write_matrix_hdf5) export(write_matrix_memory) diff --git a/r/NEWS.md b/r/NEWS.md index 28a159c6..455c04c5 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -8,6 +8,9 @@ Contributions welcome :) # BPCells 0.3.1 (in-progress main branch) +## Features +- Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166) + ## Bug-fixes - Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175) diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R index c656fba3..1d2ed577 100644 --- a/r/R/RcppExports.R +++ b/r/R/RcppExports.R @@ -457,6 +457,10 @@ write_matrix_anndata_hdf5_cpp <- function(matrix, file, group, type, row_major, invisible(.Call(`_BPCells_write_matrix_anndata_hdf5_cpp`, matrix, file, group, type, row_major, buffer_size, chunk_size, gzip_level)) } +write_matrix_anndata_hdf5_dense_cpp <- function(matrix, file, dataset, type, row_major, chunk_size, gzip_level) { + invisible(.Call(`_BPCells_write_matrix_anndata_hdf5_dense_cpp`, matrix, file, dataset, type, row_major, chunk_size, gzip_level)) +} + read_hdf5_string_cpp <- function(path, group, buffer_size) { .Call(`_BPCells_read_hdf5_string_cpp`, path, group, buffer_size) } diff --git a/r/R/matrix.R b/r/R/matrix.R index 011712e8..e5a74bc4 100644 --- a/r/R/matrix.R +++ b/r/R/matrix.R @@ -2136,22 +2136,32 @@ setMethod("short_description", "AnnDataMatrixH5", function(x) { #' Read/write AnnData matrix #' +#' 
@description #' Read or write a matrix from an anndata hdf5 file. These functions will #' automatically transpose matrices when converting to/from the AnnData #' format. This is because the AnnData convention stores cells as rows, whereas the R #' convention stores cells as columns. If this behavior is undesired, call `t()` #' manually on the matrix inputs and outputs of these functions. -#' +#' +#' Most users writing to AnnData files should default to `write_matrix_anndata_hdf5()` rather +#' than the dense variant (see details for more information). +#' #' @inheritParams open_matrix_hdf5 #' @return AnnDataMatrixH5 object, with cells as the columns. -#' @details Dimnames are inferred from `obs/_index` or `var/_index` based on length matching. +#' @details +#' **Efficiency considerations**: Reading from a dense AnnData matrix will generally be slower +#' than sparse for single cell datasets, so it is recommended to re-write any dense AnnData +#' inputs to a sparse format early in processing. +#' +#' `write_matrix_anndata_hdf5()` should be used by default, as it always writes in the more efficient sparse format. +#' `write_matrix_anndata_hdf5_dense()` writes in the AnnData dense format, and can be used for smaller matrices +#' when efficiency and file size are less of a concern than increased portability (e.g. writing to `obsm` or `varm` matrices). +#' See the [AnnData docs](https://anndata.readthedocs.io/en/latest/fileformat-prose.html#dense-arrays) for format details. +#' +#' **Dimension names:** Dimnames are inferred from `obs/_index` or `var/_index` based on length matching. #' This helps to infer dimnames for `obsp`,` varm`, etc. If the number of `len(obs) == len(var)`, #' dimname inference will be disabled. #' -#' *Efficiency considerations*: Reading from a dense AnnData matrix will generally be slower -#' than sparse for single cell datasets, so it is recommended to re-write any dense AnnData -#' inputs to a sparse format early in processing. 
Note that `write_matrix_hdf5()` will only -#' write in the sparse format. #' @export open_matrix_anndata_hdf5 <- function(path, group = "X", buffer_size = 16384L) { assert_is_file(path) @@ -2191,6 +2201,28 @@ write_matrix_anndata_hdf5 <- function(mat, path, group = "X", buffer_size = 1638 open_matrix_anndata_hdf5(path, group, buffer_size) } +#' @rdname open_matrix_anndata_hdf5 +#' @inheritParams write_matrix_anndata_hdf5 +#' +#' @param dataset The dataset within the hdf5 file to write the matrix to. Used for `write_matrix_anndata_hdf5_dense` +#' +#' @export +write_matrix_anndata_hdf5_dense <- function(mat, path, dataset = "X", buffer_size = 16384L, chunk_size = 1024L, gzip_level = 0L) { + assert_is(mat, "IterableMatrix") + assert_is(path, "character") + mat <- t(mat) + write_matrix_anndata_hdf5_dense_cpp( + iterate_matrix(mat), + path, + dataset, + matrix_type(mat), + mat@transpose, + chunk_size, + gzip_level + ) + open_matrix_anndata_hdf5(path, dataset, buffer_size) +} + #' Import MatrixMarket files #' #' Read a sparse matrix from a MatrixMarket file. This is a text-based format used by diff --git a/r/man/open_matrix_anndata_hdf5.Rd b/r/man/open_matrix_anndata_hdf5.Rd index 8a8759a1..2798ff74 100644 --- a/r/man/open_matrix_anndata_hdf5.Rd +++ b/r/man/open_matrix_anndata_hdf5.Rd @@ -3,6 +3,7 @@ \name{open_matrix_anndata_hdf5} \alias{open_matrix_anndata_hdf5} \alias{write_matrix_anndata_hdf5} +\alias{write_matrix_anndata_hdf5_dense} \title{Read/write AnnData matrix} \usage{ open_matrix_anndata_hdf5(path, group = "X", buffer_size = 16384L) @@ -15,6 +16,15 @@ write_matrix_anndata_hdf5( chunk_size = 1024L, gzip_level = 0L ) + +write_matrix_anndata_hdf5_dense( + mat, + path, + dataset = "X", + buffer_size = 16384L, + chunk_size = 1024L, + gzip_level = 0L +) } \arguments{ \item{path}{Path to the hdf5 file on disk} @@ -28,6 +38,8 @@ in memory before calling writes to disk.} \item{chunk_size}{For performance tuning only. 
The chunk size used for the HDF5 array storage.} \item{gzip_level}{Gzip compression level. Default is 0 (no compression)} + +\item{dataset}{The dataset within the hdf5 file to write the matrix to. Used for \code{write_matrix_anndata_hdf5_dense}} } \value{ AnnDataMatrixH5 object, with cells as the columns. @@ -38,14 +50,21 @@ automatically transpose matrices when converting to/from the AnnData format. This is because the AnnData convention stores cells as rows, whereas the R convention stores cells as columns. If this behavior is undesired, call \code{t()} manually on the matrix inputs and outputs of these functions. + +Most users writing to AnnData files should default to \code{write_matrix_anndata_hdf5()} rather +than the dense variant (see details for more information). } \details{ -Dimnames are inferred from \verb{obs/_index} or \verb{var/_index} based on length matching. +\strong{Efficiency considerations}: Reading from a dense AnnData matrix will generally be slower +than sparse for single cell datasets, so it is recommended to re-write any dense AnnData +inputs to a sparse format early in processing. + +\code{write_matrix_anndata_hdf5()} should be used by default, as it always writes in the more efficient sparse format. +\code{write_matrix_anndata_hdf5_dense()} writes in the AnnData dense format, and can be used for smaller matrices +when efficiency and file size are less of a concern than increased portability (e.g. writing to \code{obsm} or \code{varm} matrices). +See the \href{https://anndata.readthedocs.io/en/latest/fileformat-prose.html#dense-arrays}{AnnData docs} for format details. + +\strong{Dimension names:} Dimnames are inferred from \verb{obs/_index} or \verb{var/_index} based on length matching. This helps to infer dimnames for \code{obsp},\code{ varm}, etc. If the number of \code{len(obs) == len(var)}, dimname inference will be disabled. 
- -\emph{Efficiency considerations}: Reading from a dense AnnData matrix will generally be slower -than sparse for single cell datasets, so it is recommended to re-write any dense AnnData -inputs to a sparse format early in processing. Note that \code{write_matrix_hdf5()} will only -write in the sparse format. } diff --git a/r/src/Makevars.in b/r/src/Makevars.in index a3d987f4..c816cc37 100644 --- a/r/src/Makevars.in +++ b/r/src/Makevars.in @@ -35,6 +35,7 @@ bpcells-cpp/fragmentIterators/StoredFragments.o \ bpcells-cpp/fragmentUtils/BedWriter.o \ bpcells-cpp/fragmentUtils/FootprintMatrix.o \ bpcells-cpp/fragmentUtils/InsertionIterator.o \ +bpcells-cpp/matrixIterators/H5DenseMatrixWriter.o \ bpcells-cpp/matrixIterators/ImportMatrix10xHDF5.o \ bpcells-cpp/matrixIterators/ImportMatrixAnnDataHDF5.o \ bpcells-cpp/matrixIterators/PeakMatrix.o \ diff --git a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp index 2a9bd6a2..697d8b3f 100644 --- a/r/src/RcppExports.cpp +++ b/r/src/RcppExports.cpp @@ -1578,6 +1578,22 @@ BEGIN_RCPP return R_NilValue; END_RCPP } +// write_matrix_anndata_hdf5_dense_cpp +void write_matrix_anndata_hdf5_dense_cpp(SEXP matrix, std::string file, std::string dataset, std::string type, bool row_major, uint32_t chunk_size, uint32_t gzip_level); +RcppExport SEXP _BPCells_write_matrix_anndata_hdf5_dense_cpp(SEXP matrixSEXP, SEXP fileSEXP, SEXP datasetSEXP, SEXP typeSEXP, SEXP row_majorSEXP, SEXP chunk_sizeSEXP, SEXP gzip_levelSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type matrix(matrixSEXP); + Rcpp::traits::input_parameter< std::string >::type file(fileSEXP); + Rcpp::traits::input_parameter< std::string >::type dataset(datasetSEXP); + Rcpp::traits::input_parameter< std::string >::type type(typeSEXP); + Rcpp::traits::input_parameter< bool >::type row_major(row_majorSEXP); + Rcpp::traits::input_parameter< uint32_t >::type chunk_size(chunk_sizeSEXP); + Rcpp::traits::input_parameter< uint32_t >::type 
gzip_level(gzip_levelSEXP); + write_matrix_anndata_hdf5_dense_cpp(matrix, file, dataset, type, row_major, chunk_size, gzip_level); + return R_NilValue; +END_RCPP +} // read_hdf5_string_cpp std::vector read_hdf5_string_cpp(std::string path, std::string group, uint32_t buffer_size); RcppExport SEXP _BPCells_read_hdf5_string_cpp(SEXP pathSEXP, SEXP groupSEXP, SEXP buffer_sizeSEXP) { @@ -2650,6 +2666,7 @@ static const R_CallMethodDef CallEntries[] = { {"_BPCells_dims_matrix_anndata_hdf5_cpp", (DL_FUNC) &_BPCells_dims_matrix_anndata_hdf5_cpp, 3}, {"_BPCells_iterate_matrix_anndata_hdf5_cpp", (DL_FUNC) &_BPCells_iterate_matrix_anndata_hdf5_cpp, 6}, {"_BPCells_write_matrix_anndata_hdf5_cpp", (DL_FUNC) &_BPCells_write_matrix_anndata_hdf5_cpp, 8}, + {"_BPCells_write_matrix_anndata_hdf5_dense_cpp", (DL_FUNC) &_BPCells_write_matrix_anndata_hdf5_dense_cpp, 7}, {"_BPCells_read_hdf5_string_cpp", (DL_FUNC) &_BPCells_read_hdf5_string_cpp, 3}, {"_BPCells_hdf5_group_exists_cpp", (DL_FUNC) &_BPCells_hdf5_group_exists_cpp, 2}, {"_BPCells_hdf5_group_objnames_cpp", (DL_FUNC) &_BPCells_hdf5_group_objnames_cpp, 2}, diff --git a/r/src/bpcells-cpp/matrixIterators/H5DenseMatrixWriter.cpp b/r/src/bpcells-cpp/matrixIterators/H5DenseMatrixWriter.cpp new file mode 100644 index 00000000..2d007de9 --- /dev/null +++ b/r/src/bpcells-cpp/matrixIterators/H5DenseMatrixWriter.cpp @@ -0,0 +1,149 @@ +// Copyright 2024 BPCells contributors +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +#include "../arrayIO/hdf5.h" +#include "../utils/filesystem_compat.h" + +#include "H5DenseMatrixWriter.h" + +namespace BPCells { + +template class H5DenseMatrixWriter : public MatrixWriter { + private: + HighFive::File h5file; + std::string dataset_path; + uint64_t chunk_size; + uint32_t gzip_level; + + HighFive::DataType datatype = HighFive::create_datatype(); + bool row_major; + + HighFive::DataSet createH5Matrix(uint64_t nrow, uint64_t ncol) { + HighFive::SilenceHDF5 s; + + // Create a dataspace with initial shape and max shape + uint64_t nrow_h5 = nrow; + uint64_t ncol_h5 = ncol; + if (row_major) { + nrow_h5 = ncol; + ncol_h5 = nrow; + } + HighFive::DataSpace dataspace({nrow_h5, ncol_h5}); + + // Use chunking + HighFive::DataSetCreateProps props; + props.add(HighFive::Chunking(std::vector{std::min(chunk_size, nrow_h5), std::min(chunk_size, ncol_h5)})); + if (gzip_level > 0) { + props.add(HighFive::Shuffle()); + props.add(HighFive::Deflate(gzip_level)); + } + // Create the dataset + return h5file.createDataSet(dataset_path, dataspace, props); + } + + public: + H5DenseMatrixWriter( + HighFive::File h5file, + std::string dataset, + bool row_major, + uint64_t chunk_size = 1024, + uint32_t gzip_level = 0 + ) + : h5file(h5file) + , dataset_path(dataset) + , chunk_size(chunk_size) + , gzip_level(gzip_level) + , row_major(row_major) {} + + void write(MatrixLoader &mat_in, std::atomic *user_interrupt = NULL) override { + + HighFive::DataSet h5dataset = createH5Matrix(mat_in.rows(), mat_in.cols()); + + bool loaded = false; // Any non-zero values has been loaded. 
+ std::vector val_buf(mat_in.rows()); // buffer for each column + while (mat_in.nextCol()) { + if (user_interrupt != NULL && *user_interrupt) return; + while (mat_in.load()) { + loaded = true; + uint32_t *row_data = mat_in.rowData(); + T *val_data = mat_in.valData(); + uint32_t capacity = mat_in.capacity(); + for (uint32_t i = 0; i < capacity; i++) { + val_buf[row_data[i]] = val_data[i]; + } + if (user_interrupt != NULL && *user_interrupt) return; + } + if (loaded) { + if (row_major) { + h5dataset.select({(uint64_t)mat_in.currentCol(), 0}, {1, val_buf.size()}).write_raw(val_buf.data(), datatype); + } else { + h5dataset.select({0, (uint64_t)mat_in.currentCol()}, {val_buf.size(), 1}).write_raw(val_buf.data(), datatype); + } + } + for (auto &x : val_buf) { + x = 0; + } + loaded = false; + } + h5dataset.createAttribute("encoding-type", std::string("array")); + h5dataset.createAttribute("encoding-version", std::string("0.2.0")); + } +}; + + +template +std::unique_ptr> createAnnDataDenseMatrix( + std::string file, + std::string dataset, + bool row_major, + uint32_t chunk_size, + uint32_t gzip_level +) { + HighFive::SilenceHDF5 s; + + // Create the HDF5 file + std_fs::path path(file); + if (path.has_parent_path() && !std_fs::exists(path.parent_path())) { + std_fs::create_directories(path.parent_path()); + } + HighFive::File h5file(file, HighFive::File::OpenOrCreate); + + return std::make_unique>(h5file, dataset, row_major, chunk_size, gzip_level); +} + +// Explicit template instantiations +template std::unique_ptr> createAnnDataDenseMatrix( + std::string file, + std::string dataset, + bool row_major, + uint32_t chunk_size, + uint32_t gzip_level +); +template std::unique_ptr> createAnnDataDenseMatrix( + std::string file, + std::string dataset, + bool row_major, + uint32_t chunk_size, + uint32_t gzip_level +); +template std::unique_ptr> createAnnDataDenseMatrix( + std::string file, + std::string dataset, + bool row_major, + uint32_t chunk_size, + uint32_t gzip_level +); 
+template std::unique_ptr> createAnnDataDenseMatrix( + std::string file, + std::string dataset, + bool row_major, + uint32_t chunk_size, + uint32_t gzip_level +); + +} // namespace BPCells diff --git a/r/src/bpcells-cpp/matrixIterators/H5DenseMatrixWriter.h b/r/src/bpcells-cpp/matrixIterators/H5DenseMatrixWriter.h new file mode 100644 index 00000000..757baa0e --- /dev/null +++ b/r/src/bpcells-cpp/matrixIterators/H5DenseMatrixWriter.h @@ -0,0 +1,35 @@ +// Copyright 2024 BPCells contributors +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#pragma once +#include 
#include 
#include 

#include "MatrixIterator.h" + +namespace BPCells { + +/** + * @brief Write a matrix to an AnnData HDF5 file in dense format + * + * @param file Path of the HDF5 file on disk + * @param dataset Path of the output dataset within the HDF5 file + * @param row_major If true, write output in transposed (row major) order. (Input is always interpreted as col major) + * @param chunk_size Chunk size used for HDF5 array storage + * @param gzip_level If non-zero, level of gzip compression to use while writing. 
+ */ +template std::unique_ptr> createAnnDataDenseMatrix( + std::string file, + std::string dataset, + bool row_major, + uint32_t chunk_size, + uint32_t gzip_level +); + +} // namespace BPCells diff --git a/r/src/matrix_io.cpp b/r/src/matrix_io.cpp index 6a0c3b8d..cd559880 100644 --- a/r/src/matrix_io.cpp +++ b/r/src/matrix_io.cpp @@ -18,6 +18,7 @@ #include "bpcells-cpp/matrixIterators/StoredMatrixTransposeWriter.h" #include "bpcells-cpp/matrixIterators/StoredMatrixWriter.h" #include "bpcells-cpp/matrixIterators/StoredMatrixSparseColumn.h" +#include "bpcells-cpp/matrixIterators/H5DenseMatrixWriter.h" #include "bpcells-cpp/arrayIO/binaryfile.h" #include "bpcells-cpp/arrayIO/hdf5.h" @@ -932,6 +933,50 @@ void write_matrix_anndata_hdf5_cpp( } } +template +void write_matrix_anndata_hdf5_dense_base( + SEXP matrix, + std::string file, + std::string dataset, + bool row_major, + uint32_t chunk_size, + uint32_t gzip_level +) { + auto loader = take_unique_xptr>(matrix); + loader->restart(); + + auto w = createAnnDataDenseMatrix(file, dataset, row_major, chunk_size, gzip_level); + run_with_R_interrupt_check(&MatrixWriter::write, w.get(), std::ref(*loader)); + createAnnDataObsVarIfMissing(*loader, file, row_major, gzip_level); +} + +// [[Rcpp::export]] +void write_matrix_anndata_hdf5_dense_cpp( + SEXP matrix, + std::string file, + std::string dataset, + std::string type, + bool row_major, + uint32_t chunk_size, + uint32_t gzip_level +) { + if (type == "uint32_t") { + write_matrix_anndata_hdf5_dense_base( + matrix, file, dataset, row_major, chunk_size, gzip_level + ); + } else if (type == "float") { + write_matrix_anndata_hdf5_dense_base( + matrix, file, dataset, row_major, chunk_size, gzip_level + ); + } else if (type == "double") { + write_matrix_anndata_hdf5_dense_base( + matrix, file, dataset, row_major, chunk_size, gzip_level + ); + } else { + throw std::runtime_error("write_matrix_anndata_hdf5_dense_cpp: unsupported type " + type); + } +} + // [[Rcpp::export]] std::vector 
read_hdf5_string_cpp(std::string path, std::string group, uint32_t buffer_size) { diff --git a/r/tests/testthat/test-matrix_io.R b/r/tests/testthat/test-matrix_io.R index 0f3ca40f..3d69871d 100644 --- a/r/tests/testthat/test-matrix_io.R +++ b/r/tests/testthat/test-matrix_io.R @@ -317,6 +317,71 @@ test_that("AnnData write works", { expect_identical(mat_2, mat_2_res) }) +test_that("AnnData write dense matrix works", { + dir <- withr::local_tempdir() + mat_1 <- generate_sparse_matrix(10, 15) + rownames(mat_1) <- paste0("mat1_row", seq_len(nrow(mat_1))) + colnames(mat_1) <- NULL + mat_2 <- generate_sparse_matrix(10, 20) + + mat_1_res <- write_matrix_anndata_hdf5_dense(as(mat_1, "IterableMatrix"), file.path(dir, "mat.h5ad")) %>% + as.matrix() + mat_2_res <- write_matrix_anndata_hdf5_dense(t(as(t(mat_2), "IterableMatrix")), file.path(dir, "mat.h5ad"), dataset = "varm/mat2") %>% + as.matrix() + expect_identical(rownames(mat_1_res), rownames(mat_1)) + expect_identical(colnames(mat_1_res), as.character(seq_len(ncol(mat_1)) - 1L)) + + expect_identical(rownames(mat_2_res), rownames(mat_1)) + expect_identical(colnames(mat_2_res), NULL) + + dimnames(mat_1) <- NULL + dimnames(mat_2) <- NULL + dimnames(mat_1_res) <- NULL + dimnames(mat_2_res) <- NULL + expect_identical(as.matrix(mat_1), mat_1_res) + expect_identical(as.matrix(mat_2), mat_2_res) + + # Test empty columns + mat_3 <- generate_sparse_matrix(10, 15) + mat_3[, 4] <- 0 + mat_3_res <- write_matrix_anndata_hdf5_dense(as(mat_3, "IterableMatrix"), file.path(dir, "mat_3.h5ad")) %>% + as.matrix() + expect_identical(as.matrix(mat_3), mat_3_res) + + # Test empty columns + mat_3 <- generate_sparse_matrix(10, 15) + mat_3[, 4:6] <- 0 + mat_3_res <- write_matrix_anndata_hdf5_dense(as(mat_3, "IterableMatrix"), file.path(dir, "mat_4.h5ad")) %>% + as.matrix() + expect_identical(as.matrix(mat_3), mat_3_res) + + m <- matrix(0, nrow = 3, ncol = 4) + m[2, 2] <- 1 + m[3, 4] <- 1 + rownames(m) <- paste0("row", seq_len(nrow(m))) + colnames(m) 
<- paste0("col", seq_len(ncol(m))) + mat <- m |> as("dgCMatrix") |> as("IterableMatrix") + ans <- write_matrix_anndata_hdf5_dense(mat, file.path(dir, "zeros.h5")) + expect_identical(as.matrix(mat), as.matrix(ans)) + + # Create a dense IterableMatrix + mat_3 <- as(mat_1, "IterableMatrix") %>% + multiply_cols(1 / Matrix::colSums(mat_1)) %>% + log1p() + stats <- matrix_stats(mat_3, row_stats = "variance") + gene_means <- stats$row_stats["mean", ] + gene_vars <- stats$row_stats["variance", ] + mat_3 <- (mat_3 - gene_means) / gene_vars + rownames(mat_3) <- paste0("mat3_row", seq_len(nrow(mat_3))) + colnames(mat_3) <- paste0("mat3_col", seq_len(ncol(mat_3))) + mat_3_res <- write_matrix_anndata_hdf5_dense(mat_3, file.path(dir, "mat2.h5ad")) %>% + as.matrix() + expect_identical(as.matrix(mat_3), mat_3_res) + mat_3_res <- write_matrix_anndata_hdf5_dense(t(mat_3), file.path(dir, "mat3.h5ad")) %>% + as.matrix() + expect_identical(as.matrix(t(mat_3)), mat_3_res) +}) + test_that("AnnData types round-trip", { dir <- withr::local_tempdir() mat <- generate_sparse_matrix(10, 15) From bb7f5e24f22a115feed2a0ad9f6b12f1ea8b9169 Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Fri, 10 Jan 2025 16:51:58 -0800 Subject: [PATCH 039/142] Update NEWS --- r/NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/r/NEWS.md b/r/NEWS.md index 455c04c5..deefee46 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -10,6 +10,7 @@ Contributions welcome :) ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. 
(Thanks to @ycli1995 for pull request #166) +- Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168) ## Bug-fixes - Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175) From 765a0cb82ed30503c565ad4b374733b85ba33135 Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Fri, 10 Jan 2025 17:00:53 -0800 Subject: [PATCH 040/142] Update docs --- r/R/transforms.R | 2 +- r/man/normalize.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index e79c404d..05759f9e 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -928,7 +928,7 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { # Normalizations ################# -#' Normalization recipes +#' Normalization helper functions #' #' Apply standard normalizations to a `(features x cells)` counts matrix. #' diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd index f4bde193..ac53f2c0 100644 --- a/r/man/normalize.Rd +++ b/r/man/normalize.Rd @@ -3,7 +3,7 @@ \name{normalize_log} \alias{normalize_log} \alias{normalize_tfidf} -\title{Normalization recipes} +\title{Normalization helper functions} \usage{ normalize_log(mat, scale_factor = 10000, threads = 1L) From 3a332096245def0b6c2a1b8d845d714074b6a974 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 13 Jan 2025 16:46:14 -0800 Subject: [PATCH 041/142] [r] add partial args to normalizations --- r/R/transforms.R | 17 +++++++++++++++++ r/tests/testthat/test-matrix_transforms.R | 3 +++ 2 files changed, 20 insertions(+) diff --git a/r/R/transforms.R b/r/R/transforms.R index 05759f9e..b097e898 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -943,6 +943,14 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { + if (rlang::is_missing(mat)) { + 
return( + purrr::partial( + normalize_log, + scale_factor = scale_factor, threads = threads + ) + ) + } assert_is(mat, "IterableMatrix") assert_is_numeric(scale_factor) assert_greater_than_zero(scale_factor) @@ -963,6 +971,15 @@ normalize_tfidf <- function( mat, feature_means = NULL, scale_factor = 1e4, threads = 1L ) { + if (rlang::is_missing(mat)) { + return( + purrr::partial( + normalize_tfidf, + feature_means = feature_means, scale_factor = scale_factor, + threads = threads + ) + ) + } assert_is(mat, "IterableMatrix") assert_is_wholenumber(threads) # If feature means are passed in, only need to calculate term frequency diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index 67641e54..24cd9c23 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -367,6 +367,8 @@ test_that("tf-idf normalization works", { res <- normalize_tfidf(m2) expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6) res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) + res_with_row_means_partial <- normalize_tfidf(feature_means = row_means)(m2) + expect_equal(res_with_row_means, res_with_row_means_partial) expect_identical(res, res_with_row_means) res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled) @@ -386,5 +388,6 @@ test_that("normalize_log works", { # Test that changing scale factor works res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") + res_2_partial <- as(normalize_log(scale_factor = 1e5)(m2), "dgCMatrix") expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6) }) \ No newline at end of file From 891868c0c9b2d902f9781d34012d007e56b53d31 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 13 Jan 2025 19:18:45 -0800 Subject: [PATCH 042/142] [r] create mechanism for partial calls on explicit args --- r/R/transforms.R | 13 ++++++------- r/R/utils.R | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 7 
deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index b097e898..02ab51b8 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -943,11 +943,11 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { + # browser() if (rlang::is_missing(mat)) { return( - purrr::partial( - normalize_log, - scale_factor = scale_factor, threads = threads + partial_explicit( + normalize_log, scale_factor = scale_factor, threads = threads ) ) } @@ -973,10 +973,9 @@ normalize_tfidf <- function( ) { if (rlang::is_missing(mat)) { return( - purrr::partial( - normalize_tfidf, - feature_means = feature_means, scale_factor = scale_factor, - threads = threads + partial_explicit( + normalize_tfidf, feature_means = feature_means, + scale_factor = scale_factor, threads = threads ) ) } diff --git a/r/R/utils.R b/r/R/utils.R index 4ea62d15..784f6106 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -56,4 +56,20 @@ log_progress <- function(msg, add_timestamp = TRUE){ } else { message(msg) } +} + +# Helper function to create partial explicit functions +# This builds upon purrr::partial by allowing for nested partial calls, where each partial call +# only does partial application of the arguments that were explicitly provided. +partial_explicit <- function(fn, ...) { + args <- rlang::enquos(...) 
+ evaluated_args <- purrr::map(args, rlang::eval_tidy) + # Fetch the default arguments from the function definition + default_args <- formals(fn) + # Keep only explicitly provided arguments that were evaluated + # where the values are different from the default arguments + explicitly_passed_args <- evaluated_args[names(evaluated_args) %in% names(default_args) & + !purrr::map2_lgl(evaluated_args, default_args[names(evaluated_args)], identical)] + # Return a partially applied version of the function using evaluated arguments + return(purrr::partial(fn, !!!explicitly_passed_args)) } \ No newline at end of file From 00922d71bdc0f0bfd388e7f4e76cb798703474c8 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 13 Jan 2025 21:19:46 -0800 Subject: [PATCH 043/142] [r] add partial calls, update feature selection docs --- r/R/singlecell_utils.R | 49 ++++++++------- r/man/feature_selection.Rd | 78 ++++++++++++++++++++++++ r/man/select_features_by_dispersion.Rd | 42 ------------- r/man/select_features_by_mean.Rd | 34 ----------- r/man/select_features_by_variance.Rd | 41 ------------- r/pkgdown/_pkgdown.yml | 2 + r/tests/testthat/test-singlecell_utils.R | 8 ++- 7 files changed, 113 insertions(+), 141 deletions(-) create mode 100644 r/man/feature_selection.Rd delete mode 100644 r/man/select_features_by_dispersion.Rd delete mode 100644 r/man/select_features_by_mean.Rd delete mode 100644 r/man/select_features_by_variance.Rd diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 7db9fd2f..25b7df5a 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -11,34 +11,42 @@ # Feature selection ################# -#' Get the most variable features within a matrix. +#' Feature selection functions +#' +#' Apply a feature selection method to a `(features x cells)` matrix. +#' @rdname feature_selection #' @param mat (IterableMatrix) dimensions features x cells #' @param num_feats (integer) Number of features to return. 
If the number is higher than the number of features in the matrix, #' all features will be returned. #' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed. #' @param threads (integer) Number of threads to use. #' @returns -#' Return a dataframe with the following columns, sorted descending by variance: +#' Return a dataframe with the following columns, sorted descending by score: #' - `names`: Feature name. -#' - `score`: Variance of the feature. +#' - `score`: Scoring of the feature, depending on the method used. #' - `highly_variable`: Logical vector of whether the feature is highly variable. +#' +#' Each different feature selection method will have a different scoring method: +#' - `select_features_by_variance`: Score representing variance of each feature. #' @details -#' Calculate using the following process: +#' `select_features_by_variance` Calculates the variance of each feature using the following process: #' 1. Perform an optional term frequency + log normalization, for each feature. -#' 2. Find `num_feats` features with the highest variance across clusters. +#' 2. Find `num_feats` features with the highest variance. #' @export select_features_by_variance <- function( mat, num_feats = 25000, normalize = normalize_log, threads = 1L ) { + if (rlang::is_missing(mat)) { + return(purrr::partial(select_features_by_variance, num_feats = num_feats, normalize = normalize, threads = threads)) + } assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) assert_is_wholenumber(num_feats) assert_len(num_feats, 1) assert_is(num_feats, "numeric") num_feats <- min(max(num_feats, 0), nrow(mat)) - if (!is.null(normalize)) mat <- normalize(mat, threads = threads) features_df <- tibble::tibble( names = rownames(mat), @@ -50,15 +58,11 @@ select_features_by_variance <- function( } -#' Get the features with the highest dispersion within a matrix. 
+#' @rdname feature_selection #' @returns -#' Return a dataframe with the following columns, sorted descending by dispersion: -#' - `names`: Feature name. -#' - `score`: Variance of the feature. -#' - `highly_variable`: Logical vector of whether the feature is highly variable. -#' @inheritParams select_features_by_variance +#' - `select_features_by_dispersion`: Score representing the dispersion of each feature. #' @details -#' Calculate using the following process: +#' `select_features_by_dispersion` calculates the dispersion of each feature using the following process: #' 1. Perform an optional term frequency + log normalization, for each feature. #' 2. Find the dispersion (variance/mean) of each feature. #' 3. Find `num_feats` features with the highest dispersion. @@ -67,6 +71,9 @@ select_features_by_dispersion <- function( normalize = NULL, threads = 1L ) { + if (rlang::is_missing(mat)) { + return(partial_explicit(select_features_by_dispersion, num_feats = num_feats, normalize = normalize, threads = threads)) + } assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) assert_is_wholenumber(num_feats) @@ -86,21 +93,18 @@ select_features_by_dispersion <- function( } -#' Get the top features from a matrix, based on the mean accessibility of each feature. -#' @param num_feats Number of features to deem as highly accessible. If the number is higher than the number of features in the matrix, -#' all features will be returned. -#' @inheritParams select_features_by_variance +#' @rdname feature_selection #' @returns -#' Return a dataframe with the following columns, sorted descending by mean accessibility: -#' - `names`: Feature name. -#' - `score`: Binarize sum of each feature. -#' - `highly_variable`: Logical vector of whether the feature is highly accessible by mean accessibility. +#' - `select_features_by_mean`: Score representing the mean accessibility of each feature. 
#' @details -#' Calculate using the following process: +#' `select_features_by_mean` calculates the mean accessibility of each feature using the following process: #' 1. Get the sum of each binarized feature. #' 2. Find `num_feats` features with the highest accessibility. #' @export select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) { + if (rlang::is_missing(mat)) { + return(partial_explicit(select_features_by_mean, num_feats = num_feats, threads = threads)) + } assert_is(mat, "IterableMatrix") assert_is_wholenumber(num_feats) assert_greater_than_zero(num_feats) @@ -420,7 +424,6 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } - #' Aggregate counts matrices by cell group or feature. #' #' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd new file mode 100644 index 00000000..c57a8e40 --- /dev/null +++ b/r/man/feature_selection.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_by_variance} +\alias{select_features_by_variance} +\alias{select_features_by_dispersion} +\alias{select_features_by_mean} +\title{Feature selection functions} +\usage{ +select_features_by_variance( + mat, + num_feats = 25000, + normalize = normalize_log, + threads = 1L +) + +select_features_by_dispersion( + mat, + num_feats = 25000, + normalize = NULL, + threads = 1L +) + +select_features_by_mean(mat, num_feats = 25000, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{normalize}{(function) Normalize matrix using a given function. 
If \code{NULL}, no normalization is performed.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +Return a dataframe with the following columns, sorted descending by score: +\itemize{ +\item \code{names}: Feature name. +\item \code{score}: Scoring of the feature, depending on the method used. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. +} + +Each different feature selection method will have a different scoring method: +\itemize{ +\item \code{select_features_by_variance}: Score representing variance of each feature. +} + +\itemize{ +\item \code{select_features_by_dispersion}: Score representing the dispersion of each feature. +} + +\itemize{ +\item \code{select_features_by_mean}: Score representing the mean accessibility of each feature. +} +} +\description{ +Apply a feature selection method to a \verb{(features x cells)} matrix. +} +\details{ +\code{select_features_by_variance} Calculates the variance of each feature using the following process: +\enumerate{ +\item Perform an optional term frequency + log normalization, for each feature. +\item Find \code{num_feats} features with the highest variance. +} + +\code{select_features_by_dispersion} calculates the dispersion of each feature using the following process: +\enumerate{ +\item Perform an optional term frequency + log normalization, for each feature. +\item Find the dispersion (variance/mean) of each feature. +\item Find \code{num_feats} features with the highest dispersion. +} + +\code{select_features_by_mean} calculates the mean accessibility of each feature using the following process: +\enumerate{ +\item Get the sum of each binarized feature. +\item Find \code{num_feats} features with the highest accessibility. 
+} +} diff --git a/r/man/select_features_by_dispersion.Rd b/r/man/select_features_by_dispersion.Rd deleted file mode 100644 index 2835c9a8..00000000 --- a/r/man/select_features_by_dispersion.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/singlecell_utils.R -\name{select_features_by_dispersion} -\alias{select_features_by_dispersion} -\title{Get the features with the highest dispersion within a matrix.} -\usage{ -select_features_by_dispersion( - mat, - num_feats = 25000, - normalize = NULL, - threads = 1L -) -} -\arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} - -\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, -all features will be returned.} - -\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} - -\item{threads}{(integer) Number of threads to use.} -} -\value{ -Return a dataframe with the following columns, sorted descending by dispersion: -\itemize{ -\item \code{names}: Feature name. -\item \code{score}: Variance of the feature. -\item \code{highly_variable}: Logical vector of whether the feature is highly variable. -} -} -\description{ -Get the features with the highest dispersion within a matrix. -} -\details{ -Calculate using the following process: -\enumerate{ -\item Perform an optional term frequency + log normalization, for each feature. -\item Find the dispersion (variance/mean) of each feature. -\item Find \code{num_feats} features with the highest dispersion. 
-} -} diff --git a/r/man/select_features_by_mean.Rd b/r/man/select_features_by_mean.Rd deleted file mode 100644 index c05b0acb..00000000 --- a/r/man/select_features_by_mean.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/singlecell_utils.R -\name{select_features_by_mean} -\alias{select_features_by_mean} -\title{Get the top features from a matrix, based on the mean accessibility of each feature.} -\usage{ -select_features_by_mean(mat, num_feats = 25000, threads = 1L) -} -\arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} - -\item{num_feats}{Number of features to deem as highly accessible. If the number is higher than the number of features in the matrix, -all features will be returned.} - -\item{threads}{(integer) Number of threads to use.} -} -\value{ -Return a dataframe with the following columns, sorted descending by mean accessibility: -\itemize{ -\item \code{names}: Feature name. -\item \code{score}: Binarize sum of each feature. -\item \code{highly_variable}: Logical vector of whether the feature is highly accessible by mean accessibility. -} -} -\description{ -Get the top features from a matrix, based on the mean accessibility of each feature. -} -\details{ -Calculate using the following process: -\enumerate{ -\item Get the sum of each binarized feature. -\item Find \code{num_feats} features with the highest accessibility. 
-} -} diff --git a/r/man/select_features_by_variance.Rd b/r/man/select_features_by_variance.Rd deleted file mode 100644 index b7cc375f..00000000 --- a/r/man/select_features_by_variance.Rd +++ /dev/null @@ -1,41 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/singlecell_utils.R -\name{select_features_by_variance} -\alias{select_features_by_variance} -\title{Get the most variable features within a matrix.} -\usage{ -select_features_by_variance( - mat, - num_feats = 25000, - normalize = normalize_log, - threads = 1L -) -} -\arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} - -\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, -all features will be returned.} - -\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} - -\item{threads}{(integer) Number of threads to use.} -} -\value{ -Return a dataframe with the following columns, sorted descending by variance: -\itemize{ -\item \code{names}: Feature name. -\item \code{score}: Variance of the feature. -\item \code{highly_variable}: Logical vector of whether the feature is highly variable. -} -} -\description{ -Get the most variable features within a matrix. -} -\details{ -Calculate using the following process: -\enumerate{ -\item Perform an optional term frequency + log normalization, for each feature. -\item Find \code{num_feats} features with the highest variance across clusters. 
-} -} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 30cf2eeb..24c0fb73 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -60,6 +60,7 @@ articles: - "web-only/programming-philosophy" - "web-only/developer-notes" - "web-only/manuscript-draft" + - "web-only/manuscript-draft" # cosmo, flatly, united, sandstone all look reasonable # pulse, lumen, zephyr @@ -146,6 +147,7 @@ reference: - subtitle: "Dimensionality reduction" - contents: - normalize_log + - select_features_by_variance - subtitle: "Clustering" - contents: - knn_hnsw diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index bc5a0e03..b70c42af 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -25,11 +25,17 @@ test_that("select_features works general case", { expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable expect_setequal(res$names, rownames(m1)) res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows - res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) + res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) + res_feats_partial <- get(fn)(num_feats = 100)(m1) + expect_identical(res_feats_equal_rows, res_feats_partial) expect_identical(res_more_feats_than_rows, res_feats_equal_rows) if (fn == "select_features_by_variance") { # Check that normalization actually does something res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) + # Check that we can do partial functions on normalization too + res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize = purrr::partial(normalize_log(scale = 1e3, threads = 1L)))) + res_norm_implicit_partial <- select_features_by_variance(normalize = normalize_log(scale_factor = 1e3), num_feats = 10)(m1) + expect_identical(res_norm_partial, res_norm_implicit_partial) expect_true(!all((res %>% dplyr::arrange(names))$score == 
(res_no_norm %>% dplyr::arrange(names))$score)) } } From 4e27f5dc494516f45ac8ab5a2d7cd54a0ed167db Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 4 Nov 2024 15:14:26 -0800 Subject: [PATCH 044/142] [r] add lsi, variable feature selection --- r/NAMESPACE | 2 + r/R/singlecell_utils.R | 106 +++++++++++++++++++++++ r/man/highly_variable_features.Rd | 36 ++++++++ r/pkgdown/_pkgdown.yml | 1 - r/tests/testthat/test-singlecell_utils.R | 62 +++++++++++++ 5 files changed, 206 insertions(+), 1 deletion(-) create mode 100644 r/man/highly_variable_features.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 4892b6d5..4a366a02 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -53,6 +53,7 @@ export(gene_score_archr) export(gene_score_tiles_archr) export(gene_score_weights_archr) export(get_trackplot_height) +export(highly_variable_features) export(import_matrix_market) export(import_matrix_market_10x) export(knn_annoy) @@ -60,6 +61,7 @@ export(knn_hnsw) export(knn_to_geodesic_graph) export(knn_to_snn_graph) export(log1p_slow) +export(lsi) export(marker_features) export(match_gene_symbol) export(matrix_stats) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 25b7df5a..aa87a023 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -479,4 +479,110 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L) { } } return(res) +} + +#' Perform latent semantic indexing (LSI) on a matrix. +#' @param mat (IterableMatrix) dimensions features x cells +#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param scale_factor (integer) Scale factor for the tf-idf log transform. +#' @param save_in_memory (logical) If TRUE, save the log(tf-idf) matrix in memory. +#' If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, +#' but will require in higher memory usage. Comparison of memory usage and speed is in the details section. 
+#' @param threads (integer) Number of threads to use. +#' @return dgCMatrix of shape (n_dimensions, ncol(mat)). +#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. +#' +#' ** Saving in memory vs disk: ** +#' Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. +#' This is done to prevent re-calculation of queued operations during PCA optimization. +#' +#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +#' - Saving in memory: 233 MB memory usage, 22.7 seconds runtime +#' - Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +#' +#' @export +lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FALSE, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(n_dimensions) + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_is_wholenumber(threads) + + # log(tf-idf) transform + npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` + tf <- mat %>% multiply_cols(1 / npeaks) + idf_ <- ncol(mat) / rowSums(mat) + mat_tfidf <- tf %>% multiply_rows(idf_) + mat_log_tfidf <- log1p(scale_factor * mat_tfidf) + # Save to prevent re-calculation of queued operations + if (save_in_memory) { + mat_log_tfidf <- write_matrix_memory(mat_log_tfidf, compress = FALSE) + } else { + mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) + } + # Z-score normalization + cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance", threads = threads)$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_lsi_norm <- mat_log_tfidf %>% + 
add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + # Run pca + svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr_$u) %*% mat_lsi_norm + return(pca_res) +} + +#' Get the most variable features within a matrix +#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, +#' ll features will be returned. +#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +#' and if the number of features +#' within a bin is less than 2, the dispersion is set to 1. +#' @returns IterableMatrix subset of the most variable features. +#' @inheritParams lsi +#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). +#' +#' Calculate using the following process: +#' 1. Calculate the dispersion of each feature (variance / mean) +#' 2. Log normalize dispersion and mean +#' 3. 
Bin the features by their means, and normalize dispersion within each bin +#' @export +highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(n_bins) + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) + return(mat) + } + + feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats["mean", ] + feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats["variance", ] + feature_means[feature_means == 0] <- 1e-12 + feature_dispersion <- feature_vars / feature_means + feature_dispersion[feature_dispersion == 0] <- NA + feature_dispersion <- log(feature_dispersion) + feature_means <- log1p(feature_means) + mean_bins <- cut(feature_means, n_bins, labels = FALSE) + + bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x, na.rm = TRUE)) + bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x, na.rm = TRUE)) + # Set feats that are in bins with only one feat to have a norm dispersion of 1 + one_gene_bin <- is.na(bin_sd) + bin_sd[one_gene_bin] <- bin_mean[one_gene_bin] + bin_mean[one_gene_bin] <- 0 + # map mean_bins indices to bin_stats + # Do a character search as bins without features mess up numeric indexing + feature_dispersion_norm <- (feature_dispersion - bin_mean[as.character(mean_bins)]) / bin_sd[as.character(mean_bins)] + names(feature_dispersion_norm) <- names(feature_dispersion) + feature_dispersion_norm <- sort(feature_dispersion_norm) # sorting automatically removes NA values + if (length(feature_dispersion_norm) < num_feats) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all non-zero features", length(feature_dispersion_norm), num_feats)) + 
variable_features_ <- feature_dispersion_norm[max(1, (length(feature_dispersion_norm) - num_feats + 1)):length(feature_dispersion_norm)] + return(mat[names(variable_features_), ]) } \ No newline at end of file diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd new file mode 100644 index 00000000..54e67599 --- /dev/null +++ b/r/man/highly_variable_features.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{highly_variable_features} +\alias{highly_variable_features} +\title{Get the most variable features within a matrix} +\usage{ +highly_variable_features(mat, num_feats, n_bins, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +ll features will be returned.} + +\item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +and if the number of features +within a bin is less than 2, the dispersion is set to 1.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +IterableMatrix subset of the most variable features. +} +\description{ +Get the most variable features within a matrix +} +\details{ +The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). 
+ +Calculate using the following process: +\enumerate{ +\item Calculate the dispersion of each feature (variance / mean) +\item Log normalize dispersion and mean +\item Bin the features by their means, and normalize dispersion within each bin +} +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 24c0fb73..9c842b06 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -60,7 +60,6 @@ articles: - "web-only/programming-philosophy" - "web-only/developer-notes" - "web-only/manuscript-draft" - - "web-only/manuscript-draft" # cosmo, flatly, united, sandstone all look reasonable # pulse, lumen, zephyr diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index b70c42af..7ac82b3c 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -17,6 +17,10 @@ generate_dense_matrix <- function(nrow, ncol) { m <- matrix(runif(nrow * ncol), nrow = nrow) } +generate_dense_matrix <- function(nrow, ncol) { + m <- matrix(runif(nrow * ncol), nrow = nrow) +} + test_that("select_features works general case", { m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { @@ -42,6 +46,25 @@ test_that("select_features works general case", { }) +test_that("select_features works general case", { + m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") + for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { + res <- do.call(fn, list(m1, num_feats = 10)) + expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting + expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable + expect_setequal(res$names, rownames(m1)) + res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows + res_feats_equal_rows <- do.call(fn, 
list(m1, num_feats = 100)) + expect_identical(res_more_feats_than_rows, res_feats_equal_rows) + if (fn != "select_features_by_mean") { + # Check that normalization actually does something + res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) + expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) + } + } +}) + + test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -232,3 +255,42 @@ test_that("LSI works", { expect_equal(lsi_res, lsi_res_proj) }) + + +test_that("Feature selection by bin variance works", { + mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") + # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to Seurat + res_table <- select_features_by_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) + res_table_t <- select_features_by_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) + res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + res <- mat[res_feats,] + res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + res_t <- t(mat[,res_feats_t]) + + expect_equal(nrow(res), 10) + expect_equal(ncol(res), 26) + expect_equal(nrow(res_t), 10) + expect_equal(ncol(res_t), 500) +}) + +test_that("LSI works", { + mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to ArchR + lsi_res_obj <- LSI(mat, n_dimensions = 5) + lsi_res_t_obj <- LSI(t(mat), n_dimensions = 5) + # Also check partial args + lsi_res_obj_partial <- LSI(n_dimensions = 5)(mat) + lsi_res <- lsi_res_obj$cell_embeddings + lsi_res_t <- lsi_res_t_obj$cell_embeddings + # Check that projection results in the same output if used on the same input matrix + lsi_res_proj <- project(lsi_res_obj, mat) + + expect_equal(nrow(lsi_res), 5) + expect_equal(ncol(lsi_res), ncol(mat)) + expect_equal(nrow(lsi_res_t), 5) + expect_equal(ncol(lsi_res_t), nrow(mat)) + expect_equal(lsi_res_obj, lsi_res_obj_partial) + expect_equal(lsi_res, lsi_res_proj) +}) From ebebad470160cfbbd1182fc19863cd712203e853 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 8 Jan 2025 17:30:41 -0800 Subject: [PATCH 045/142] [r] update LSI to use norm, feature selection helpers --- r/NAMESPACE | 2 -- r/man/highly_variable_features.Rd | 36 -------------------- r/man/select_features_by_dispersion.Rd | 42 ++++++++++++++++++++++++ r/tests/testthat/test-singlecell_utils.R | 19 ----------- 4 files changed, 42 insertions(+), 57 deletions(-) delete mode 100644 r/man/highly_variable_features.Rd create mode 100644 r/man/select_features_by_dispersion.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 4a366a02..4892b6d5 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -53,7 +53,6 @@ export(gene_score_archr) export(gene_score_tiles_archr) export(gene_score_weights_archr) export(get_trackplot_height) -export(highly_variable_features) export(import_matrix_market) export(import_matrix_market_10x) export(knn_annoy) @@ -61,7 +60,6 @@ export(knn_hnsw) export(knn_to_geodesic_graph) export(knn_to_snn_graph) export(log1p_slow) -export(lsi) export(marker_features) export(match_gene_symbol) export(matrix_stats) diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd deleted file mode 100644 index 54e67599..00000000 --- 
a/r/man/highly_variable_features.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/singlecell_utils.R -\name{highly_variable_features} -\alias{highly_variable_features} -\title{Get the most variable features within a matrix} -\usage{ -highly_variable_features(mat, num_feats, n_bins, threads = 1L) -} -\arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} - -\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, -ll features will be returned.} - -\item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, -and if the number of features -within a bin is less than 2, the dispersion is set to 1.} - -\item{threads}{(integer) Number of threads to use.} -} -\value{ -IterableMatrix subset of the most variable features. -} -\description{ -Get the most variable features within a matrix -} -\details{ -The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). 
- -Calculate using the following process: -\enumerate{ -\item Calculate the dispersion of each feature (variance / mean) -\item Log normalize dispersion and mean -\item Bin the features by their means, and normalize dispersion within each bin -} -} diff --git a/r/man/select_features_by_dispersion.Rd b/r/man/select_features_by_dispersion.Rd new file mode 100644 index 00000000..2835c9a8 --- /dev/null +++ b/r/man/select_features_by_dispersion.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_by_dispersion} +\alias{select_features_by_dispersion} +\title{Get the features with the highest dispersion within a matrix.} +\usage{ +select_features_by_dispersion( + mat, + num_feats = 25000, + normalize = NULL, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +Return a dataframe with the following columns, sorted descending by dispersion: +\itemize{ +\item \code{names}: Feature name. +\item \code{score}: Variance of the feature. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. +} +} +\description{ +Get the features with the highest dispersion within a matrix. +} +\details{ +Calculate using the following process: +\enumerate{ +\item Perform an optional term frequency + log normalization, for each feature. +\item Find the dispersion (variance/mean) of each feature. +\item Find \code{num_feats} features with the highest dispersion. 
+} +} diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 7ac82b3c..e2033300 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -46,25 +46,6 @@ test_that("select_features works general case", { }) -test_that("select_features works general case", { - m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") - for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { - res <- do.call(fn, list(m1, num_feats = 10)) - expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting - expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable - expect_setequal(res$names, rownames(m1)) - res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows - res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) - expect_identical(res_more_feats_than_rows, res_feats_equal_rows) - if (fn != "select_features_by_mean") { - # Check that normalization actually does something - res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) - expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) - } - } -}) - - test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) From b1ab04cd25deb157cba53e028804e16c4caf2d06 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 10 Jan 2025 02:30:56 -0800 Subject: [PATCH 046/142] [r] add iterative LSI implementation --- r/R/singlecell_utils.R | 107 +++++++++++++++++++++++++++++++++++++++++ r/man/IterativeLSI.Rd | 74 ++++++++++++++++++++++++++++ r/pkgdown/_pkgdown.yml | 7 +-- 3 files changed, 183 insertions(+), 5 deletions(-) create mode 100644 r/man/IterativeLSI.Rd diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 
aa87a023..4db63764 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -360,6 +360,113 @@ project.LSI <- function(x, mat, threads = 1L, ...) { } +#' Run iterative LSI on a matrix. +#' +#' Given a `(features x cells)` matrix, Compute an iterative LSI dimensionality reduction, using the method described in [ArchR](https://doi.org/10.1038/s41588-021-00790-6) (Granja et al; 2019). +#' @param mat (IterableMatrix) +#' @param n_iterations (int) The number of LSI iterations to perform. +#' @param first_feature_selection_method (function) Method to use for selecting features for the first iteration. Current builtin options are `select_features_by_variance`, `select_features_by_dispersion`, `select_features_by_mean`, `select_features_by_binned_dispersion` +#' @param feature_selection_method (function) Method to use for selecting features for each iteration after the first. Current builtin options are `select_features_by_variance`, `select_features_by_dispersion`, `select_features_by_mean`, `select_features_by_binned_dispersion` +#' @param cluster_method (function) Method to use for clustering. Current builtin options are `cluster_graph_{leiden, louvain, seurat}()` +#' @return An object of class `c("LSI", "DimReduction")` with the following attributes: +#' - `cell_embeddings`: The projected data +#' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. 
Columns include the following: +#' - `first_feature_selection_method`: The method used for selecting features for the first iteration +#' - `lsi_method`: The method used for LSI +#' - `cluster_method`: The method used for clustering +#' - `feature_means`: The means of the features used for normalization +#' - `iterations`: The number of iterations +#' - `iter_info`: A tibble with the following columns: +#' - `iteration`: The iteration number +#' - `feature_names`: The names of the features used for the iteration +#' - `lsi_results`: The results of LSI for the iteration +#' - `clusters`: The clusters for the iteration. This is blank for the first iteration +#' @details +#' The iterative LSI method is as follows: +#' - First iteration: +#' - Select features based on the `first_feature_selection_method` argument +#' - Perform LSI on the selected features +#' - If `n_iterations` is 1, return the PCA results +#' - Else, cluster the LSI results using `cluster_method` +#' - For each subsequent iteration: +#' - Pseudobulk the clusters and select the top features based on the variance of the pseudobulked clusters +#' - Perform LSI on the selected features +#' - If this is the final iteration, return the PCA results +#' - Else, cluster the LSI results using `cluster_method` +#' @seealso `LSI()`, `top_features()`, `highly_variable_features()` +#' @inheritParams LSI +#' @export +IterativeLSI <- function( + mat, + n_iterations = 2, + first_feature_selection_method = select_features_by_binned_dispersion, + feature_selection_method = select_features_by_dispersion, + lsi_method = LSI, + cluster_method = cluster_graph_leiden, + verbose = FALSE, threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_true(n_iterations > 0) + assert_is_wholenumber(n_iterations) + assert_is_wholenumber(threads) + + fitted_params = list( + first_feature_selection_method = first_feature_selection_method, + lsi_method = lsi_method, + cluster_method = cluster_method, + feature_means = 
matrix_stats(mat, row_stats = "mean", threads = threads)$row_stats["mean",], + iterations = n_iterations, + iter_info = tibble::tibble( + iteration = integer(), + feature_names = list(), + lsi_results = list(), + clusters = list() + ) + ) + if (verbose) log_progress("Starting Iterative LSI") + for (i in seq_len(n_iterations)) { + if (verbose) log_progress(sprintf("Starting Iterative LSI iteration %s of %s", i, n_iterations)) + # add a blank row to the iter_info tibble + fitted_params$iter_info <- tibble::add_row(fitted_params$iter_info, iteration = i) + # run variable feature selection + if (verbose) log_progress("Selecting features") + if (i == 1) { + variable_features <- first_feature_selection_method(mat, threads = threads) + } else { + variable_features <- feature_selection_method(pseudobulk_res, threads = threads) + } + fitted_params$iter_info$feature_names[[i]] <- variable_features %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + + if (is.character(fitted_params$iter_info$feature_names[[i]])) { + mat_indices <- which(rownames(mat) %in% fitted_params$iter_info$feature_names[[i]]) + } else { + mat_indices <- fitted_params$iter_info$feature_names[[i]] + } + # run LSI + if (verbose) log_progress("Running LSI") + lsi_res_obj <- lsi_method(mat[mat_indices,], threads = threads) + fitted_params$iter_info$lsi_results[[i]] <- lsi_res_obj$fitted_params + # only cluster + pseudobulk if this isn't the last iteration + if (i == n_iterations) break + # cluster the LSI results + if (verbose) log_progress("Clustering LSI results") + clustering_res <- t(lsi_res_obj$cell_embeddings) %>% knn_hnsw(ef = 500, threads = threads) %>% knn_to_snn_graph() %>% cluster_method() + fitted_params$iter_info$clusters[[i]] <- clustering_res + # pseudobulk and pass onto next iteration + if (verbose) log_progress("Pseudobulking matrix") + pseudobulk_res <- pseudobulk_matrix(mat, clustering_res, threads = as.integer(threads)) %>% as("dgCMatrix") %>% as("IterableMatrix") + 
rownames(pseudobulk_res) <- rownames(mat) + } + if (verbose) log_progress("Finished running LSI") + res <- DimReduction( + x = lsi_res_obj$cell_embeddings, + fitted_params = fitted_params, + feature_names = rownames(mat) + ) + class(res) <- c("IterativeLSI", class(res)) + return(res) +} + #' Test for marker features #' #' Given a features x cells matrix, perform one-vs-all differential diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd new file mode 100644 index 00000000..37c9cb9f --- /dev/null +++ b/r/man/IterativeLSI.Rd @@ -0,0 +1,74 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{IterativeLSI} +\alias{IterativeLSI} +\title{Run iterative LSI on a matrix.} +\usage{ +IterativeLSI( + mat, + n_iterations = 2, + first_feature_selection_method = select_features_by_binned_dispersion, + feature_selection_method = select_features_by_dispersion, + lsi_method = LSI, + cluster_method = cluster_graph_leiden, + verbose = FALSE, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix)} + +\item{n_iterations}{(int) The number of LSI iterations to perform.} + +\item{first_feature_selection_method}{(function) Method to use for selecting features for the first iteration. Current builtin options are \code{select_features_by_variance}, \code{select_features_by_dispersion}, \code{select_features_by_mean}, \code{select_features_by_binned_dispersion}} + +\item{feature_selection_method}{(function) Method to use for selecting features for each iteration after the first. Current builtin options are \code{select_features_by_variance}, \code{select_features_by_dispersion}, \code{select_features_by_mean}, \code{select_features_by_binned_dispersion}} + +\item{cluster_method}{(function) Method to use for clustering. 
Current builtin options are \verb{cluster_graph_\{leiden, louvain, seurat\}()}} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +An object of class \code{c("LSI", "DimReduction")} with the following attributes: +\itemize{ +\item \code{cell_embeddings}: The projected data +\item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: +\item \code{first_feature_selection_method}: The method used for selecting features for the first iteration +\item \code{lsi_method}: The method used for LSI +\item \code{cluster_method}: The method used for clustering +\item \code{feature_means}: The means of the features used for normalization +\item \code{iterations}: The number of iterations +\item \code{iter_info}: A tibble with the following columns: +\itemize{ +\item \code{iteration}: The iteration number +\item \code{feature_names}: The names of the features used for the iteration +\item \code{lsi_results}: The results of LSI for the iteration +\item \code{clusters}: The clusters for the iteration. This is blank for the first iteration +} +} +} +\description{ +Given a \verb{(features x cells)} matrix, Compute an iterative LSI dimensionality reduction, using the method described in \href{https://doi.org/10.1038/s41588-021-00790-6}{ArchR} (Granja et al; 2019). 
+} +\details{ +The iterative LSI method is as follows: +\itemize{ +\item First iteration: +\itemize{ +\item Select features based on the \code{first_feature_selection_method} argument +\item Perform LSI on the selected features +\item If \code{n_iterations} is 1, return the PCA results +\item Else, cluster the LSI results using \code{cluster_method} +} +\item For each subsequent iteration: +\itemize{ +\item Pseudobulk the clusters and select the top features based on the variance of the pseudobulked clusters +\item Perform LSI on the selected features +\item If this is the final iteration, return the PCA results +\item Else, cluster the LSI results using \code{cluster_method} +} +} +} +\seealso{ +\code{LSI()}, \code{top_features()}, \code{highly_variable_features()} +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 9c842b06..4472ab75 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -60,6 +60,7 @@ articles: - "web-only/programming-philosophy" - "web-only/developer-notes" - "web-only/manuscript-draft" + - "web-only/manuscript-draft" # cosmo, flatly, united, sandstone all look reasonable # pulse, lumen, zephyr @@ -132,13 +133,9 @@ reference: - checksum - apply_by_row - regress_out - - normalize_log - - normalize_tfidf - - select_features_by_variance - - select_features_by_dispersion - select_features_by_binned_dispersion - - select_features_by_mean - LSI + - IterativeLSI - IterableMatrix-methods - pseudobulk_matrix From 76f4c7d2955c24b463a58b00f2299f331441a09c Mon Sep 17 00:00:00 2001 From: Immanuel Abdi <56730419+immanuelazn@users.noreply.github.com> Date: Fri, 10 Jan 2025 12:58:07 -0800 Subject: [PATCH 047/142] [r] change check for `pseudobulk_matrix()` to use whole number instead of integer (#183) --- r/R/singlecell_utils.R | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 34ed7dd6..b222a931 100644 --- a/r/R/singlecell_utils.R +++ 
b/r/R/singlecell_utils.R @@ -456,10 +456,14 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { rlang::abort(sprintf("method must be one of: %s", paste(methods, collapse = ", "))) } } - assert_is(threads, "integer") - # if multiple methods are provided, only need to pass in the top method as it will also calculate the less complex stats - iter <- iterate_matrix(parallel_split(mat, threads, threads*4)) - res <- pseudobulk_matrix_cpp(iter, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) + assert_is_wholenumber(threads) + + it <- mat %>% + convert_matrix_type("double") %>% + parallel_split(threads, threads*4) %>% + iterate_matrix() + + res <- pseudobulk_matrix_cpp(it, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) # if res is a single matrix, return with colnames and rownames if (length(method) == 1) { colnames(res[[method]]) <- levels(cell_groups) From 5e3a7fed33e786d7a550394043a615f9128da457 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 17 Jan 2025 21:34:51 -0800 Subject: [PATCH 048/142] [r] update feature selection documentation --- r/R/singlecell_utils.R | 16 ++----- r/man/feature_selection.Rd | 23 ++++++++++ r/man/lsi.Rd | 3 +- r/man/select_features_by_binned_dispersion.Rd | 46 ------------------- r/pkgdown/_pkgdown.yml | 1 - 5 files changed, 29 insertions(+), 60 deletions(-) delete mode 100644 r/man/select_features_by_binned_dispersion.Rd diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index b222a931..d2ce7bad 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -121,22 +121,14 @@ select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) { return(features_df) } - -#' Get the most variable features within a matrix. -#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, -#' all features will be returned. 
+#' @rdname feature_selection #' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. -#' @inheritParams select_features_by_variance #' @returns -#' Return a dataframe with the following columns, sorted descending by bin-normalized dispersion: -#' - `names`: Feature name. -#' - `score`: Bin-normalized dispersion of the feature. -#' - `highly_variable`: Logical vector of whether the feature is highly variable. -#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). -#' -#' Calculate using the following process: +#' - `select_features_by_binned_dispersion`: Score representing the bin normalized dispersion of each feature. +#' @details +#' `select_features_by_binned_dispersion` calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015): #' 1. Calculate the dispersion of each feature (variance / mean) #' 2. Log normalize dispersion and mean #' 3. 
Bin the features by their means, and normalize dispersion within each bin diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index c57a8e40..007ba177 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -4,6 +4,7 @@ \alias{select_features_by_variance} \alias{select_features_by_dispersion} \alias{select_features_by_mean} +\alias{select_features_by_binned_dispersion} \title{Feature selection functions} \usage{ select_features_by_variance( @@ -21,6 +22,13 @@ select_features_by_dispersion( ) select_features_by_mean(mat, num_feats = 25000, threads = 1L) + +select_features_by_binned_dispersion( + mat, + num_feats = 25000, + n_bins = 20, + threads = 1L +) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} @@ -31,6 +39,10 @@ all features will be returned.} \item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} \item{threads}{(integer) Number of threads to use.} + +\item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +and if the number of features +within a bin is less than 2, the dispersion is set to 1.} } \value{ Return a dataframe with the following columns, sorted descending by score: @@ -52,6 +64,10 @@ Each different feature selection method will have a different scoring method: \itemize{ \item \code{select_features_by_mean}: Score representing the mean accessibility of each feature. } + +\itemize{ +\item \code{select_features_by_binned_dispersion}: Score representing the bin normalized dispersion of each feature. +} } \description{ Apply a feature selection method to a \verb{(features x cells)} matrix. @@ -75,4 +91,11 @@ Apply a feature selection method to a \verb{(features x cells)} matrix. \item Get the sum of each binarized feature. \item Find \code{num_feats} features with the highest accessibility. 
} + +\code{select_features_by_binned_dispersion} calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015): +\enumerate{ +\item Calculate the dispersion of each feature (variance / mean) +\item Log normalize dispersion and mean +\item Bin the features by their means, and normalize dispersion within each bin +} } diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 13348e23..dff5c88e 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -9,7 +9,8 @@ LSI( n_dimensions = 50L, corr_cutoff = 1, normalize = normalize_tfidf, - threads = 1L + threads = 1L, + verbose = FALSE ) } \arguments{ diff --git a/r/man/select_features_by_binned_dispersion.Rd b/r/man/select_features_by_binned_dispersion.Rd deleted file mode 100644 index b3c573ae..00000000 --- a/r/man/select_features_by_binned_dispersion.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/singlecell_utils.R -\name{select_features_by_binned_dispersion} -\alias{select_features_by_binned_dispersion} -\title{Get the most variable features within a matrix.} -\usage{ -select_features_by_binned_dispersion( - mat, - num_feats = 25000, - n_bins = 20, - threads = 1L -) -} -\arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} - -\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, -all features will be returned.} - -\item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, -and if the number of features -within a bin is less than 2, the dispersion is set to 1.} - -\item{threads}{(integer) Number of threads to use.} -} -\value{ -Return a dataframe with the following columns, sorted descending by bin-normalized dispersion: -\itemize{ -\item \code{names}: Feature name. -\item \code{score}: Bin-normalized dispersion of the feature. 
-\item \code{highly_variable}: Logical vector of whether the feature is highly variable. -} -} -\description{ -Get the most variable features within a matrix. -} -\details{ -The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). - -Calculate using the following process: -\enumerate{ -\item Calculate the dispersion of each feature (variance / mean) -\item Log normalize dispersion and mean -\item Bin the features by their means, and normalize dispersion within each bin -} -} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index b34d528c..c83d52d4 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -132,7 +132,6 @@ reference: - checksum - apply_by_row - regress_out - - select_features_by_binned_dispersion - LSI - IterableMatrix-methods - pseudobulk_matrix From 16d534405b2fd0264e090c7b5beaada3b818acd8 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 24 Jan 2025 14:27:03 -0800 Subject: [PATCH 049/142] [r] reorder assertions, add new partial func system --- r/R/singlecell_utils.R | 51 +++++++++++----- r/R/transforms.R | 25 ++++---- r/R/utils.R | 75 +++++++++++++++++++----- r/tests/testthat/test-singlecell_utils.R | 2 +- 4 files changed, 112 insertions(+), 41 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 40babbe3..636591ad 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -38,16 +38,22 @@ select_features_by_variance <- function( normalize = normalize_log, threads = 1L ) { - if (rlang::is_missing(mat)) { - return(purrr::partial(select_features_by_variance, num_feats = num_feats, normalize = normalize, threads = threads)) - } - assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) assert_is_wholenumber(num_feats) assert_len(num_feats, 1) assert_is(num_feats, "numeric") + if (rlang::is_missing(mat)) { + return(create_partial( + missing_args = list( + num_feats = missing(num_feats), + normalize = missing(normalize), + threads = 
missing(threads) + ) + )) + } + assert_is(mat, "IterableMatrix") num_feats <- min(max(num_feats, 0), nrow(mat)) - if (!is.null(normalize)) mat <- normalize(mat, threads = threads) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) features_df <- tibble::tibble( names = rownames(mat), score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",] @@ -71,17 +77,22 @@ select_features_by_dispersion <- function( normalize = NULL, threads = 1L ) { - if (rlang::is_missing(mat)) { - return(partial_explicit(select_features_by_dispersion, num_feats = num_feats, normalize = normalize, threads = threads)) - } - assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) assert_is_wholenumber(num_feats) assert_len(num_feats, 1) assert_is(num_feats, "numeric") + if (rlang::is_missing(mat)) { + return(create_partial( + missing_args = list( + num_feats = missing(num_feats), + normalize = missing(normalize), + threads = missing(threads) + ) + )) + } num_feats <- min(max(num_feats, 0), nrow(mat)) - - if (!is.null(normalize)) mat <- normalize(mat, threads = threads) + assert_is(mat, "IterableMatrix") + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) features_df <- tibble::tibble( names = rownames(mat), @@ -101,17 +112,25 @@ select_features_by_dispersion <- function( #' 1. Get the sum of each binarized feature. #' 2. Find `num_feats` features with the highest accessibility. 
#' @export -select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) { - if (rlang::is_missing(mat)) { - return(partial_explicit(select_features_by_mean, num_feats = num_feats, threads = threads)) - } - assert_is(mat, "IterableMatrix") +select_features_by_mean <- function(mat, num_feats = 25000, normalize = NULL, threads = 1L) { assert_is_wholenumber(num_feats) assert_greater_than_zero(num_feats) assert_is(num_feats, "numeric") + if (rlang::is_missing(mat)) { + return(create_partial( + missing_args = list( + num_feats = missing(num_feats), + normalize = missing(normalize), + threads = missing(threads) + ) + )) + } + assert_is(mat, "IterableMatrix") num_feats <- min(max(num_feats, 0), nrow(mat)) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) # get the sum of each feature, binarized # get the top features + features_df <- tibble::tibble( names = rownames(mat), score = matrix_stats(mat, row_stats = "nonzero", threads = threads)$row_stats["nonzero", ] diff --git a/r/R/transforms.R b/r/R/transforms.R index 02ab51b8..3ce8ac56 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -943,17 +943,19 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { - # browser() + assert_is_numeric(scale_factor) + assert_greater_than_zero(scale_factor) if (rlang::is_missing(mat)) { return( - partial_explicit( - normalize_log, scale_factor = scale_factor, threads = threads + create_partial( + missing_args = list( + scale_factor = missing(scale_factor), + threads = missing(threads) + ) ) - ) + ) } assert_is(mat, "IterableMatrix") - assert_is_numeric(scale_factor) - assert_greater_than_zero(scale_factor) read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) mat <- mat %>% multiply_cols(1 / read_depth) 
return(log1p(mat * scale_factor)) @@ -971,16 +973,19 @@ normalize_tfidf <- function( mat, feature_means = NULL, scale_factor = 1e4, threads = 1L ) { + assert_is_wholenumber(threads) if (rlang::is_missing(mat)) { return( - partial_explicit( - normalize_tfidf, feature_means = feature_means, - scale_factor = scale_factor, threads = threads + create_partial( + missing_args = list( + feature_means = missing(feature_means), + scale_factor = missing(scale_factor), + threads = missing(threads) + ) ) ) } assert_is(mat, "IterableMatrix") - assert_is_wholenumber(threads) # If feature means are passed in, only need to calculate term frequency if (is.null(feature_means)) { mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) diff --git a/r/R/utils.R b/r/R/utils.R index 784f6106..d9f7b145 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -58,18 +58,65 @@ log_progress <- function(msg, add_timestamp = TRUE){ } } -# Helper function to create partial explicit functions -# This builds upon purrr::partial by allowing for nested partial calls, where each partial call -# only does partial application of the arguments that were explicitly provided. -partial_explicit <- function(fn, ...) { - args <- rlang::enquos(...) - evaluated_args <- purrr::map(args, rlang::eval_tidy) - # Fetch the default arguments from the function definition - default_args <- formals(fn) - # Keep only explicitly provided arguments that were evaluated - # where the values are different from the default arguments - explicitly_passed_args <- evaluated_args[names(evaluated_args) %in% names(default_args) & - !purrr::map2_lgl(evaluated_args, default_args[names(evaluated_args)], identical)] - # Return a partially applied version of the function using evaluated arguments - return(purrr::partial(fn, !!!explicitly_passed_args)) +#' Helper to create partial functions +#' +#' Automatically creates a partial application of the caller +#' function including all non-missing arguments. 
+#' +#' @param missing_args (named list[bool]) Any named index with a TRUE value +#' will be treated as missing. Designed to be used in the caller with the +#' `base::missing()` function to detect unspecified arguments with default values, +#' or to manually specifiy other arguments that should not be specialized +#' @return A `bpcells_partial` object (a function with some extra attributes) +create_partial <- function(missing_args=list()) { + env <- rlang::caller_env() + fn_sym <- rlang::caller_call()[[1]] + fn <- rlang::caller_fn() + + args <- list() + for (n in names(formals(fn))) { + if (n %in% names(missing_args) && missing_args[[n]]) next + if (rlang::is_missing(env[[n]])) next + args[[n]] <- env[[n]] + } + + ret <- do.call(partial_apply, c(fn, args)) + attr(ret, "body")[[1]] <- fn_sym + return(ret) +} + + +#' Create partial function calls +#' +#' Specify some but not all arguments to a function. +#' +#' @param f A function +#' @param ... Named arguments to `f` +#' @param .overwrite (bool) If `f` is already an ouptut from +#' `partial_apply()`, whether parameter re-definitions should +#' be ignored or overwrite the existing definitions +#' @return A `bpcells_partial` object (a function with some extra attributes) +partial_apply <- function(f, ..., .overwrite=FALSE) { + args <- rlang::list2(...) 
+ + if (is(f, "bpcells_partial")) { + prev_args <- attr(f, "args") + for (a in names(prev_args)) { + if (!(.overwrite && a %in% names(args))) { + args[[a]] <- prev_args[[a]] + } + } + f <- attr(f, "fn") + function_name <- attr(f, "body")[[1]] + } else { + function_name <- rlang::sym(rlang::caller_arg(f)) + } + partial_fn <- do.call(purrr::partial, c(f, args)) + attr(partial_fn, "body")[[1]] <- function_name + structure( + partial_fn, + class = c("bpcells_partial", "purrr_function_partial", "function"), + args = args, + fn = f + ) } \ No newline at end of file diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index efefc070..e9365ced 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -29,7 +29,7 @@ test_that("select_features works general case", { # Check that normalization actually does something res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) # Check that we can do partial functions on normalization too - res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize = purrr::partial(normalize_log(scale = 1e3, threads = 1L)))) + res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize = normalize_log(scale = 1e3, threads = 1L))) res_norm_implicit_partial <- select_features_by_variance(normalize = normalize_log(scale_factor = 1e3), num_feats = 10)(m1) expect_identical(res_norm_partial, res_norm_implicit_partial) expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) From 87eb430b1c51b096982d785dbc3a37cdabba516e Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 24 Jan 2025 14:45:29 -0800 Subject: [PATCH 050/142] [r] change behaviour of num_feats default args, write docs --- r/NAMESPACE | 4 ++-- r/R/singlecell_utils.R | 19 +++++++++++------- r/R/utils.R | 4 +++- r/man/create_partial.Rd | 22 +++++++++++++++++++++ r/man/feature_selection.Rd | 25 +++++++++++------------- 
r/man/partial_apply.Rd | 24 +++++++++++++++++++++++ r/pkgdown/_pkgdown.yml | 2 +- r/tests/testthat/test-singlecell_utils.R | 12 ++++++------ 8 files changed, 81 insertions(+), 31 deletions(-) create mode 100644 r/man/create_partial.Rd create mode 100644 r/man/partial_apply.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index deb92dfa..e9998384 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -110,8 +110,8 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) -export(select_features_by_mean) -export(select_features_by_variance) +export(select_features_mean) +export(select_features_variance) export(select_regions) export(set_trackplot_height) export(set_trackplot_label) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 636591ad..37f61d98 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -16,7 +16,9 @@ #' Apply a feature selection method to a `(features x cells)` matrix. #' @rdname feature_selection #' @param mat (IterableMatrix) dimensions features x cells -#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, +#' @param num_feats (float) Number of features to return. If the number given is between 0 and 1, treat as a proportion of +#' the number of rows, rounded down. Otherwise, treat as an absolute number. +#' If the number is higher than the number of features in the matrix, #' all features will be returned. #' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed. #' @param threads (integer) Number of threads to use. @@ -33,9 +35,9 @@ #' 1. Perform an optional term frequency + log normalization, for each feature. #' 2. Find `num_feats` features with the highest variance. 
#' @export -select_features_by_variance <- function( - mat, num_feats = 25000, - normalize = normalize_log, +select_features_variance <- function( + mat, num_feats = 0.05, + normalize = NULL, threads = 1L ) { assert_greater_than_zero(num_feats) @@ -53,6 +55,7 @@ select_features_by_variance <- function( } assert_is(mat, "IterableMatrix") num_feats <- min(max(num_feats, 0), nrow(mat)) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) features_df <- tibble::tibble( names = rownames(mat), @@ -72,8 +75,8 @@ select_features_by_variance <- function( #' 1. Perform an optional term frequency + log normalization, for each feature. #' 2. Find the dispersion (variance/mean) of each feature. #' 3. Find `num_feats` features with the highest dispersion. -select_features_by_dispersion <- function( - mat, num_feats = 25000, +select_features_dispersion <- function( + mat, num_feats = 0.05, normalize = NULL, threads = 1L ) { @@ -91,6 +94,7 @@ select_features_by_dispersion <- function( )) } num_feats <- min(max(num_feats, 0), nrow(mat)) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) assert_is(mat, "IterableMatrix") if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) @@ -112,7 +116,7 @@ select_features_by_dispersion <- function( #' 1. Get the sum of each binarized feature. #' 2. Find `num_feats` features with the highest accessibility. 
#' @export -select_features_by_mean <- function(mat, num_feats = 25000, normalize = NULL, threads = 1L) { +select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L) { assert_is_wholenumber(num_feats) assert_greater_than_zero(num_feats) assert_is(num_feats, "numeric") @@ -127,6 +131,7 @@ select_features_by_mean <- function(mat, num_feats = 25000, normalize = NULL, th } assert_is(mat, "IterableMatrix") num_feats <- min(max(num_feats, 0), nrow(mat)) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) # get the sum of each feature, binarized # get the top features diff --git a/r/R/utils.R b/r/R/utils.R index d9f7b145..d7c65358 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -68,6 +68,7 @@ log_progress <- function(msg, add_timestamp = TRUE){ #' `base::missing()` function to detect unspecified arguments with default values, #' or to manually specifiy other arguments that should not be specialized #' @return A `bpcells_partial` object (a function with some extra attributes) +#' @keywords internal create_partial <- function(missing_args=list()) { env <- rlang::caller_env() fn_sym <- rlang::caller_call()[[1]] @@ -92,10 +93,11 @@ create_partial <- function(missing_args=list()) { #' #' @param f A function #' @param ... Named arguments to `f` -#' @param .overwrite (bool) If `f` is already an ouptut from +#' @param .overwrite (bool) If `f` is already an output from #' `partial_apply()`, whether parameter re-definitions should #' be ignored or overwrite the existing definitions #' @return A `bpcells_partial` object (a function with some extra attributes) +#' @keywords internal partial_apply <- function(f, ..., .overwrite=FALSE) { args <- rlang::list2(...) 
diff --git a/r/man/create_partial.Rd b/r/man/create_partial.Rd new file mode 100644 index 00000000..fe92fd5a --- /dev/null +++ b/r/man/create_partial.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{create_partial} +\alias{create_partial} +\title{Helper to create partial functions} +\usage{ +create_partial(missing_args = list()) +} +\arguments{ +\item{missing_args}{(named list\link{bool}) Any named index with a TRUE value +will be treated as missing. Designed to be used in the caller with the +\code{base::missing()} function to detect unspecified arguments with default values, +or to manually specifiy other arguments that should not be specialized} +} +\value{ +A \code{bpcells_partial} object (a function with some extra attributes) +} +\description{ +Automatically creates a partial application of the caller +function including all non-missing arguments. +} +\keyword{internal} diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index c57a8e40..f83e2199 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -1,31 +1,28 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/singlecell_utils.R -\name{select_features_by_variance} -\alias{select_features_by_variance} -\alias{select_features_by_dispersion} -\alias{select_features_by_mean} +\name{select_features_variance} +\alias{select_features_variance} +\alias{select_features_dispersion} +\alias{select_features_mean} \title{Feature selection functions} \usage{ -select_features_by_variance( - mat, - num_feats = 25000, - normalize = normalize_log, - threads = 1L -) +select_features_variance(mat, num_feats = 0.05, normalize = NULL, threads = 1L) -select_features_by_dispersion( +select_features_dispersion( mat, - num_feats = 25000, + num_feats = 0.05, normalize = NULL, threads = 1L ) -select_features_by_mean(mat, num_feats = 25000, threads = 1L) +select_features_mean(mat, num_feats = 0.05, 
normalize = NULL, threads = 1L) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} -\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +\item{num_feats}{(float) Number of features to return. If the number given is between 0 and 1, treat as a proportion of +the number of rows, rounded down. Otherwise, treat as an absolute number. +If the number is higher than the number of features in the matrix, all features will be returned.} \item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} diff --git a/r/man/partial_apply.Rd b/r/man/partial_apply.Rd new file mode 100644 index 00000000..c1ea508d --- /dev/null +++ b/r/man/partial_apply.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{partial_apply} +\alias{partial_apply} +\title{Create partial function calls} +\usage{ +partial_apply(f, ..., .overwrite = FALSE) +} +\arguments{ +\item{f}{A function} + +\item{...}{Named arguments to \code{f}} + +\item{.overwrite}{(bool) If \code{f} is already an output from +\code{partial_apply()}, whether parameter re-definitions should +be ignored or overwrite the existing definitions} +} +\value{ +A \code{bpcells_partial} object (a function with some extra attributes) +} +\description{ +Specify some but not all arguments to a function. 
+}
+\keyword{internal}
diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml
index 0bf475aa..0d98b13d 100644
--- a/r/pkgdown/_pkgdown.yml
+++ b/r/pkgdown/_pkgdown.yml
@@ -139,7 +139,7 @@ reference:
   - subtitle: "Dimensionality reduction"
   - contents:
     - normalize_log
-    - select_features_by_variance
+    - select_features_variance
  - subtitle: "Clustering"
  - contents:
    - knn_hnsw
diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R
index e9365ced..08e41b7c 100644
--- a/r/tests/testthat/test-singlecell_utils.R
+++ b/r/tests/testthat/test-singlecell_utils.R
@@ -15,24 +15,24 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val =
 test_that("select_features works general case", {
   m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix")
-  for (fn in c("select_features_variance", "select_features_dispersion", "select_features_mean")) {
-    res <- do.call(fn, list(m1, num_feats = 10))
+  for (fn in c("select_features_variance", "select_features_dispersion", "select_features_mean")) {
+    res <- do.call(fn, list(m1, num_feats = 5))
     expect_equal(nrow(res), nrow(m1))
     # Check that dataframe has correct features we're expecting
-    expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable
+    expect_equal(sum(res$highly_variable), 5) # Only 5 features marked as highly variable
     expect_setequal(res$names, rownames(m1))
     res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows
     res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100))
     res_feats_partial <- get(fn)(num_feats = 100)(m1)
     expect_identical(res_feats_equal_rows, res_feats_partial)
     expect_identical(res_more_feats_than_rows, res_feats_equal_rows)
-    if (fn == "select_features_by_variance") {
+    if (fn == "select_features_variance") {
      # Check that normalization actually does something
      res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL))
      # Check that we can do
partial functions on normalization too res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize = normalize_log(scale = 1e3, threads = 1L))) - res_norm_implicit_partial <- select_features_by_variance(normalize = normalize_log(scale_factor = 1e3), num_feats = 10)(m1) + res_norm_implicit_partial <- select_features_variance(normalize = normalize_log(scale_factor = 1e3), num_feats = 10)(m1) expect_identical(res_norm_partial, res_norm_implicit_partial) - expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) + expect_true(!all((res_no_norm %>% dplyr::arrange(names))$score == (res_norm_partial %>% dplyr::arrange(names))$score)) } } }) From 6c4285b0647a27b7c0b43a5c6fffebfd07a6e872 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 24 Jan 2025 17:46:12 -0800 Subject: [PATCH 051/142] [r] fix binned dispersion naming --- r/NAMESPACE | 2 +- r/R/singlecell_utils.R | 19 ++++++++++++------- r/man/feature_selection.Rd | 7 ++++--- r/tests/testthat/test-singlecell_utils.R | 8 ++++---- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 314978df..2f78fe51 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -116,7 +116,7 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) -export(select_features_by_binned_dispersion) +export(select_features_binned_dispersion) export(select_features_mean) export(select_features_variance) export(select_regions) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index e429a257..773c5fc6 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -150,14 +150,14 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. #' @returns -#' - `select_features_by_binned_dispersion`: Score representing the bin normalized dispersion of each feature. 
+#' - `select_features_binned_dispersion`: Score representing the bin normalized dispersion of each feature.
 #' @details
-#' `select_features_by_binned_dispersion` calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015):
+#' `select_features_binned_dispersion` calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satija et al. 2015):
 #' 1. Calculate the dispersion of each feature (variance / mean)
 #' 2. Log normalize dispersion and mean
 #' 3. Bin the features by their means, and normalize dispersion within each bin
 #' @export
-select_features_by_binned_dispersion <- function(
+select_features_binned_dispersion <- function(
   mat, num_feats = 25000, n_bins = 20, threads = 1L
 ) {
@@ -289,9 +289,14 @@ LSI <- function(
 ) {
   if (rlang::is_missing(mat)) {
     return(
-      purrr::partial(
-        LSI, n_dimensions = n_dimensions, corr_cutoff = corr_cutoff,
-        normalize = normalize, threads = threads, verbose = verbose
+      create_partial(
+        missing_args = list(
+          n_dimensions = missing(n_dimensions),
+          corr_cutoff = missing(corr_cutoff),
+          normalize = missing(normalize),
+          threads = missing(threads),
+          verbose = missing(verbose)
+        )
       )
     )
   }
@@ -308,7 +313,7 @@ LSI <- function(
   if (verbose) log_progress("Normalizing matrix")
   mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads)
   read_depth <- mat_stats$col_stats["mean", ] * nrow(mat)
-  if (!is.null(normalize)) mat <- normalize(mat, threads = threads)
+  if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat)

   # Save to prevent re-calculation of queued operations
   mat <- write_matrix_dir(
diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd
index 51756a83..c391c795 100644
--- a/r/man/feature_selection.Rd
+++ b/r/man/feature_selection.Rd
@@ -4,6 +4,7 @@
 \alias{select_features_variance}
 \alias{select_features_dispersion}
\alias{select_features_mean}
+\alias{select_features_binned_dispersion}
\title{Feature selection functions}
\usage{
select_features_variance(mat, num_feats = 0.05, normalize = NULL, threads = 1L)
@@ -17,7 +18,7 @@ select_features_dispersion(

select_features_mean(mat, num_feats = 0.05, normalize = NULL, threads = 1L)

-select_features_by_binned_dispersion(
+select_features_binned_dispersion(
   mat,
   num_feats = 25000,
@@ -62,7 +63,7 @@ Each different feature selection method will have a different scoring method:
 }

\itemize{
-\item \code{select_features_by_binned_dispersion}: Score representing the bin normalized dispersion of each feature.
+\item \code{select_features_binned_dispersion}: Score representing the bin normalized dispersion of each feature.
 }
 }
 \description{
@@ -88,7 +89,7 @@ Apply a feature selection method to a \verb{(features x cells)} matrix.
 \item Find \code{num_feats} features with the highest accessibility.
 }

-\code{select_features_by_binned_dispersion} calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015):
+\code{select_features_binned_dispersion} calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satija et al. 2015):
\enumerate{
 \item Calculate the dispersion of each feature (variance / mean)
 \item Log normalize dispersion and mean
diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R
index 441ea851..33effb58 100644
--- a/r/tests/testthat/test-singlecell_utils.R
+++ b/r/tests/testthat/test-singlecell_utils.R
@@ -197,8 +197,8 @@ test_that("Pseudobulk aggregation works with multiple return types", {
 test_that("Feature selection by bin variance works", {
   mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix")
   # Test only that outputs are reasonable.
There is a full comparison in `tests/real_data/` that compares implementation to Seurat - res_table <- select_features_by_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) - res_table_t <- select_features_by_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) + res_table <- select_features_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) + res_table_t <- select_features_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) res <- mat[res_feats,] res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) @@ -218,7 +218,7 @@ test_that("LSI works", { lsi_res_obj <- LSI(mat, n_dimensions = 5) lsi_res_t_obj <- LSI(t(mat), n_dimensions = 5) # Also check partial args - lsi_res_obj_partial <- LSI(n_dimensions = 5)(mat) + lsi_res_obj_partial <- LSI(n_dimensions = 5, normalize = normalize_tfidf(scale_factor = 10000, threads = 4), threads = 4)(mat) lsi_res <- lsi_res_obj$cell_embeddings lsi_res_t <- lsi_res_t_obj$cell_embeddings # Check that projection results in the same output if used on the same input matrix @@ -228,7 +228,7 @@ test_that("LSI works", { expect_equal(ncol(lsi_res), ncol(mat)) expect_equal(nrow(lsi_res_t), 5) expect_equal(ncol(lsi_res_t), nrow(mat)) - expect_equal(lsi_res_obj, lsi_res_obj_partial) + expect_equal(lsi_res_obj$cell_embeddings, lsi_res_obj_partial$cell_embeddings) expect_equal(lsi_res, lsi_res_proj) }) From 04f67f2df4058d3ce219dfeae6e905b21f2d69c6 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 24 Jan 2025 17:56:17 -0800 Subject: [PATCH 052/142] [r] change normalize text for feature selection --- r/R/singlecell_utils.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 696b312e..69d304c5 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -20,7 +20,8 @@ #' the number of rows, rounded down. 
Otherwise, treat as an absolute number. #' If the number is higher than the number of features in the matrix, #' all features will be returned. -#' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed. +#' @param normalize (function) Normalize matrix using a given function. Normalization occurs prior on the input mat prior to feature +#' selection. If `NULL`, no normalization is performed. @seealso `normalize_tfidf()` `normalize_log()` #' @param threads (integer) Number of threads to use. #' @returns #' Return a dataframe with the following columns, sorted descending by score: From 4d175578974cef79db8c3533f6ebdc6cfc832340 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Sun, 26 Jan 2025 16:01:57 -0800 Subject: [PATCH 053/142] [r] add iterative LSI --- r/NAMESPACE | 3 + r/R/singlecell_utils.R | 263 ++++++++--------------- r/man/IterativeLSI.Rd | 16 +- r/man/lsi.Rd | 6 +- r/pkgdown/_pkgdown.yml | 3 +- r/tests/testthat/test-singlecell_utils.R | 39 ++-- 6 files changed, 124 insertions(+), 206 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 2f78fe51..f51755bf 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -3,6 +3,7 @@ S3method(base::as.data.frame,IterableFragments) S3method(base::as.matrix,IterableMatrix) S3method(project,DimReduction) +S3method(project,IterativeLSI) S3method(project,LSI) S3method(project,default) S3method(svds,IterableMatrix) @@ -11,6 +12,7 @@ export("all_matrix_inputs<-") export("cellNames<-") export("chrNames<-") export(DimReduction) +export(IterativeLSI) export(LSI) export(add_cols) export(add_rows) @@ -117,6 +119,7 @@ export(sctransform_pearson) export(select_cells) export(select_chromosomes) export(select_features_binned_dispersion) +export(select_features_dispersion) export(select_features_mean) export(select_features_variance) export(select_regions) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 69d304c5..f038eca5 100644 --- a/r/R/singlecell_utils.R +++ 
b/r/R/singlecell_utils.R @@ -20,21 +20,19 @@ #' the number of rows, rounded down. Otherwise, treat as an absolute number. #' If the number is higher than the number of features in the matrix, #' all features will be returned. -#' @param normalize (function) Normalize matrix using a given function. Normalization occurs prior on the input mat prior to feature +#' @param normalize (function) Normalize matrix using a given function. Normalization occurs on the input mat prior to feature #' selection. If `NULL`, no normalization is performed. @seealso `normalize_tfidf()` `normalize_log()` #' @param threads (integer) Number of threads to use. #' @returns -#' Return a dataframe with the following columns, sorted descending by score: +#' Return a dataframe with the following columns: #' - `names`: Feature name. #' - `score`: Scoring of the feature, depending on the method used. #' - `highly_variable`: Logical vector of whether the feature is highly variable. #' -#' Each different feature selection method will have a different scoring method: -#' - `select_features_by_variance`: Score representing variance of each feature. -#' @details -#' `select_features_by_variance` Calculates the variance of each feature using the following process: -#' 1. Perform an optional term frequency + log normalization, for each feature. -#' 2. Find `num_feats` features with the highest variance. +#' Each different feature selection method will have a different scoring method. 
+#' For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, determine the score of +#' each feature \eqn{x_i} as follows: +#' - `select_features_by_variance`: \eqn{\mathrm{Score}(x_i) = \frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2} #' @export select_features_variance <- function( mat, num_feats = 0.05, @@ -42,7 +40,6 @@ select_features_variance <- function( threads = 1L ) { assert_greater_than_zero(num_feats) - assert_is_wholenumber(num_feats) assert_len(num_feats, 1) assert_is(num_feats, "numeric") if (rlang::is_missing(mat)) { @@ -62,27 +59,21 @@ select_features_variance <- function( names = rownames(mat), score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",] ) %>% - dplyr::arrange(desc(score)) %>% - dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) + dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) return(features_df) } #' @rdname feature_selection #' @returns -#' - `select_features_by_dispersion`: Score representing the dispersion of each feature. -#' @details -#' `select_features_by_dispersion` calculates the dispersion of each feature using the following process: -#' 1. Perform an optional term frequency + log normalization, for each feature. -#' 2. Find the dispersion (variance/mean) of each feature. -#' 3. Find `num_feats` features with the highest dispersion. 
+#' - `select_features_by_dispersion`: \eqn{\mathrm{Score}(x_i) = \frac{\frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2}{\bar{x}_i}} +#' @export select_features_dispersion <- function( mat, num_feats = 0.05, normalize = NULL, threads = 1L ) { assert_greater_than_zero(num_feats) - assert_is_wholenumber(num_feats) assert_len(num_feats, 1) assert_is(num_feats, "numeric") if (rlang::is_missing(mat)) { @@ -103,22 +94,16 @@ select_features_dispersion <- function( names = rownames(mat), score = mat_stats$row_stats["variance", ] / mat_stats$row_stats["mean", ] ) %>% - dplyr::arrange(desc(score)) %>% - dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) + dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) return(features_df) } #' @rdname feature_selection #' @returns -#' - `select_features_by_mean`: Score representing the mean accessibility of each feature. -#' @details -#' `select_features_by_mean` calculates the mean accessibility of each feature using the following process: -#' 1. Get the sum of each binarized feature. -#' 2. Find `num_feats` features with the highest accessibility. 
+#' - `select_features_by_mean`: \eqn{\mathrm{Score}(x_i) = \bar{x}_i} #' @export select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L) { - assert_is_wholenumber(num_feats) assert_greater_than_zero(num_feats) assert_is(num_feats, "numeric") if (rlang::is_missing(mat)) { @@ -141,8 +126,7 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread names = rownames(mat), score = matrix_stats(mat, row_stats = "nonzero", threads = threads)$row_stats["nonzero", ] ) %>% - dplyr::arrange(desc(score)) %>% - dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) + dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) return(features_df) } @@ -164,7 +148,6 @@ select_features_binned_dispersion <- function( ) { assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) - assert_is_wholenumber(num_feats) assert_len(num_feats, 1) assert_is_wholenumber(n_bins) assert_len(n_bins, 1) @@ -176,7 +159,7 @@ select_features_binned_dispersion <- function( score = rep(0, nrow(mat)), highly_variable = rep(TRUE, nrow(mat)) ) - return(mat) + return(features_df) } # Calculate row information for dispersion mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) @@ -185,7 +168,7 @@ select_features_binned_dispersion <- function( # Calculate dispersion, and log normalize feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- NA - feature_dispersion <- log1p(feature_dispersion) + feature_dispersion <- log(feature_dispersion) feature_dispersion[feature_means == 0] <- 0 feature_means <- log1p(feature_means) features_df <- tibble::tibble( @@ -203,8 +186,7 @@ select_features_binned_dispersion <- function( score = if (dplyr::n() == 1) {1} else {score} # Set feats that are in bins with only one feat to have a norm dispersion of 1 ) %>% dplyr::ungroup() %>% - dplyr::arrange(desc(score)) %>% - dplyr::mutate(highly_variable = 
dplyr::row_number() <= num_feats) %>% + dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) %>% dplyr::select(c("names", "score", "highly_variable")) return(features_df) } @@ -268,12 +250,12 @@ project.DimReduction <- function(x, mat, ...) { #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to #' the corr_cutoff, it will be excluded from the final PCA matrix. -#' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed. +#' @param scale_factor (numeric) Scale factor to use for tf-idf normalization. #' @param threads (integer) Number of threads to use. #' @returns An object of class `c("LSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data #' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: -#' - `normalization_method`: The normalization method used +#' - `scale_factor`: The scale factor used for tf-idf normalization #' - `feature_means`: The means of the features used for normalization #' - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth #' - `svd_params`: The matrix calculated for SVD @@ -282,10 +264,10 @@ project.DimReduction <- function(x, mat, ...) 
{ #' #' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: #' - 17.1 MB memory usage, 25.1 seconds runtime -#' @seealso `project()` `DimReduction()` `normalize_tfidf()` +#' @seealso `project()` `DimReduction()` `normalize_tfidf()` #' @export LSI <- function( - mat, n_dimensions = 50L, corr_cutoff = 1, normalize = normalize_tfidf, + mat, n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, threads = 1L, verbose = FALSE ) { if (rlang::is_missing(mat)) { @@ -294,7 +276,6 @@ LSI <- function( missing_args = list( n_dimensions = missing(n_dimensions), corr_cutoff = missing(corr_cutoff), - normalize = missing(normalize), threads = missing(threads), verbose = missing(verbose) ) @@ -314,8 +295,12 @@ LSI <- function( if (verbose) log_progress("Normalizing matrix") mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads) read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) - if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) - + mat <- partial_apply( + normalize_tfidf, + feature_means = mat_stats$row_stats["mean", ], + scale_factor = scale_factor, + threads = threads + )(mat) # Save to prevent re-calculation of queued operations mat <- write_matrix_dir( convert_matrix_type(mat, type = "float"), @@ -334,7 +319,7 @@ LSI <- function( pca_res <- pca_res[pca_feats_to_keep, ] } fitted_params <- list( - normalization_method = normalize, + scale_factor = scale_factor, feature_means = mat_stats$row_stats["mean", ], pcs_to_keep = pca_feats_to_keep, svd_params = svd_attr @@ -348,7 +333,6 @@ LSI <- function( return(res) } - #' @export project.LSI <- function(x, mat, threads = 1L, ...) { assert_is(mat, "IterableMatrix") @@ -361,18 +345,15 @@ project.LSI <- function(x, mat, threads = 1L, ...) 
{ assert_true(all(x$feature_names %in% rownames(mat))) mat <- mat[x$feature_names, ] } - - if (!is.null(fitted_params$normalization_method)) { - mat <- fitted_params$normalization_method( - mat, - feature_means = fitted_params$feature_means, - threads = threads - ) - mat <- write_matrix_dir( - convert_matrix_type(mat, type = "float"), - tempfile("mat"), compress = TRUE - ) - } + mat <- partial_apply( + normalize_tfidf, + feature_means = fitted_params$feature_means, + threads = threads + )(mat) + mat <- write_matrix_dir( + convert_matrix_type(mat, type = "float"), + tempfile("mat"), compress = TRUE + ) pca_attr <- fitted_params$svd_params res <- t(pca_attr$u) %*% mat if (length(fitted_params$pcs_to_keep) != nrow(res)) { @@ -387,10 +368,11 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' Given a `(features x cells)` matrix, Compute an iterative LSI dimensionality reduction, using the method described in [ArchR](https://doi.org/10.1038/s41588-021-00790-6) (Granja et al; 2019). #' @param mat (IterableMatrix) #' @param n_iterations (int) The number of LSI iterations to perform. -#' @param first_feature_selection_method (function) Method to use for selecting features for the first iteration. Current builtin options are `select_features_by_variance`, `select_features_by_dispersion`, `select_features_by_mean`, `select_features_by_binned_dispersion` -#' @param feature_selection_method (function) Method to use for selecting features for each iteration after the first. Current builtin options are `select_features_by_variance`, `select_features_by_dispersion`, `select_features_by_mean`, `select_features_by_binned_dispersion` +#' @param first_feature_selection_method (function) Method to use for selecting features for the first iteration. 
Current builtin options are `select_features_by_variance`, `select_features_by_dispersion`, `select_features_by_mean`, `select_features_binned_dispersion` +#' @param feature_selection_method (function) Method to use for selecting features for each iteration after the first. Current builtin options are `select_features_by_variance`, `select_features_by_dispersion`, `select_features_by_mean`, `select_features_binned_dispersion` #' @param cluster_method (function) Method to use for clustering. Current builtin options are `cluster_graph_{leiden, louvain, seurat}()` -#' @return An object of class `c("LSI", "DimReduction")` with the following attributes: +#' @param lsi_method (function) Method to use for LSI. Only `LSI` is allowed. The user can pass in partial parameters to `LSI` to customize the LSI method, such as by passing `LSI(n_dimensions = 30, corr_cutoff = 0.5)`. +#' @return An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data #' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: #' - `first_feature_selection_method`: The method used for selecting features for the first iteration @@ -401,7 +383,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' - `iter_info`: A tibble with the following columns: #' - `iteration`: The iteration number #' - `feature_names`: The names of the features used for the iteration -#' - `lsi_results`: The results of LSI for the iteration +#' - `lsi_results`: The results of LSI for the iteration. This follows the same structure as the `fitted_params` attribute of the `LSI` object, but information such as the `v` and `d` matrices are removed. #' - `clusters`: The clusters for the iteration. This is blank for the first iteration #' @details #' The iterative LSI method is as follows: @@ -415,15 +397,15 @@ project.LSI <- function(x, mat, threads = 1L, ...) 
{ #' - Perform LSI on the selected features #' - If this is the final iteration, return the PCA results #' - Else, cluster the LSI results using `cluster_method` -#' @seealso `LSI()`, `top_features()`, `highly_variable_features()` +#' @seealso `LSI()`, `feature_selection`, `DimReduction()` #' @inheritParams LSI #' @export IterativeLSI <- function( mat, n_iterations = 2, - first_feature_selection_method = select_features_by_binned_dispersion, - feature_selection_method = select_features_by_dispersion, - lsi_method = LSI, + first_feature_selection_method = select_features_binned_dispersion, + feature_selection_method = select_features_dispersion, + lsi_method = LSI, # Make only allowed to be LSI cluster_method = cluster_graph_leiden, verbose = FALSE, threads = 1L ) { @@ -468,8 +450,11 @@ IterativeLSI <- function( if (verbose) log_progress("Running LSI") lsi_res_obj <- lsi_method(mat[mat_indices,], threads = threads) fitted_params$iter_info$lsi_results[[i]] <- lsi_res_obj$fitted_params + # remove the feature means from the lsi results as they are already calculated + # save minimum info for lsi results if not on terminal iteration + fitted_params$iter_info$lsi_results[[i]]$feature_means <- NULL # only cluster + pseudobulk if this isn't the last iteration - if (i == n_iterations) break + if (i == n_iterations) break # cluster the LSI results if (verbose) log_progress("Clustering LSI results") clustering_res <- t(lsi_res_obj$cell_embeddings) %>% knn_hnsw(ef = 500, threads = threads) %>% knn_to_snn_graph() %>% cluster_method() @@ -477,6 +462,10 @@ IterativeLSI <- function( # pseudobulk and pass onto next iteration if (verbose) log_progress("Pseudobulking matrix") pseudobulk_res <- pseudobulk_matrix(mat, clustering_res, threads = as.integer(threads)) %>% as("dgCMatrix") %>% as("IterableMatrix") + # Only take the SVD information required to project the matrix + fitted_params$iter_info$lsi_results[[i]]$svd_params <- list( + u = lsi_res_obj$fitted_params$svd_params$u + )
rownames(pseudobulk_res) <- rownames(mat) } if (verbose) log_progress("Finished running LSI") @@ -488,6 +477,48 @@ IterativeLSI <- function( class(res) <- c("IterativeLSI", class(res)) return(res) } +#' @export +project.IterativeLSI <- function(x, mat, threads = 1L, ...) { + assert_is(mat, "IterableMatrix") + assert_is(x, "IterativeLSI") + fitted_params <- x$fitted_params + # Get the final row of fitted params + last_iter_info <- fitted_params$iter_info[nrow(fitted_params$iter_info),] + + # Do a check to make sure that the fitted features all exist in input matrix + if (!is.null(rownames(mat)) && !is.null(x$feature_names)) { + assert_true(all(x$feature_names %in% rownames(mat))) + } + # Subset to variable features + if (is.character(last_iter_info$feature_names[[1]])) { + mat_indices <- which(rownames(mat) %in% last_iter_info$feature_names[[1]]) + } else { + mat_indices <- last_iter_info$feature_names[[1]] + } + mat <- mat[mat_indices,] + # Run LSI + # since we don't hold the LSI object, we copy the internal logic from `project.LSI()` + lsi_attr <- attr(x$fitted_params$lsi_method, "args") + + func <- partial_apply( + normalize_tfidf, + feature_means = fitted_params$feature_means, + scale_factor = lsi_attr$scale_factor, + threads = threads + ) + mat <- func(mat) + mat <- write_matrix_dir( + convert_matrix_type(mat, type = "float"), + tempfile("mat"), compress = TRUE + ) + + pca_attr <- last_iter_info$lsi_results[[1]]$svd_params + res <- t(pca_attr$u) %*% mat + if (length(last_iter_info$lsi_results[[1]]$pcs_to_keep) != nrow(res)) { + res <- res[last_iter_info$lsi_results[[1]]$pcs_to_keep,] + } + return(res) +} #' Test for marker features #' @@ -610,110 +641,4 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L) { } } return(res) -} - -#' Perform latent semantic indexing (LSI) on a matrix. -#' @param mat (IterableMatrix) dimensions features x cells -#' @param n_dimensions (integer) Number of dimensions to keep during PCA. 
-#' @param scale_factor (integer) Scale factor for the tf-idf log transform. -#' @param save_in_memory (logical) If TRUE, save the log(tf-idf) matrix in memory. -#' If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, -#' but will require in higher memory usage. Comparison of memory usage and speed is in the details section. -#' @param threads (integer) Number of threads to use. -#' @return dgCMatrix of shape (n_dimensions, ncol(mat)). -#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. -#' -#' ** Saving in memory vs disk: ** -#' Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. -#' This is done to prevent re-calculation of queued operations during PCA optimization. -#' -#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: -#' - Saving in memory: 233 MB memory usage, 22.7 seconds runtime -#' - Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime -#' -#' @export -lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FALSE, threads = 1L) { - assert_is(mat, "IterableMatrix") - assert_is_wholenumber(n_dimensions) - assert_len(n_dimensions, 1) - assert_greater_than_zero(n_dimensions) - assert_true(n_dimensions < min(ncol(mat), nrow(mat))) - assert_is_wholenumber(threads) - - # log(tf-idf) transform - npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` - tf <- mat %>% multiply_cols(1 / npeaks) - idf_ <- ncol(mat) / rowSums(mat) - mat_tfidf <- tf %>% multiply_rows(idf_) - mat_log_tfidf <- log1p(scale_factor * mat_tfidf) - # Save to prevent re-calculation of queued operations - if (save_in_memory) { - mat_log_tfidf <- 
write_matrix_memory(mat_log_tfidf, compress = FALSE) - } else { - mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) - } - # Z-score normalization - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance", threads = threads)$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_lsi_norm <- mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) - # Run pca - svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions, threads = threads) - pca_res <- t(svd_attr_$u) %*% mat_lsi_norm - return(pca_res) -} - -#' Get the most variable features within a matrix -#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, -#' ll features will be returned. -#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, -#' and if the number of features -#' within a bin is less than 2, the dispersion is set to 1. -#' @returns IterableMatrix subset of the most variable features. -#' @inheritParams lsi -#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). -#' -#' Calculate using the following process: -#' 1. Calculate the dispersion of each feature (variance / mean) -#' 2. Log normalize dispersion and mean -#' 3. 
Bin the features by their means, and normalize dispersion within each bin -#' @export -highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { - assert_is(mat, "IterableMatrix") - assert_greater_than_zero(num_feats) - assert_is_wholenumber(num_feats) - assert_len(num_feats, 1) - assert_is_wholenumber(n_bins) - assert_len(n_bins, 1) - assert_greater_than_zero(n_bins) - if (nrow(mat) <= num_feats) { - log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) - return(mat) - } - - feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats["mean", ] - feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats["variance", ] - feature_means[feature_means == 0] <- 1e-12 - feature_dispersion <- feature_vars / feature_means - feature_dispersion[feature_dispersion == 0] <- NA - feature_dispersion <- log(feature_dispersion) - feature_means <- log1p(feature_means) - mean_bins <- cut(feature_means, n_bins, labels = FALSE) - - bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x, na.rm = TRUE)) - bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x, na.rm = TRUE)) - # Set feats that are in bins with only one feat to have a norm dispersion of 1 - one_gene_bin <- is.na(bin_sd) - bin_sd[one_gene_bin] <- bin_mean[one_gene_bin] - bin_mean[one_gene_bin] <- 0 - # map mean_bins indices to bin_stats - # Do a character search as bins without features mess up numeric indexing - feature_dispersion_norm <- (feature_dispersion - bin_mean[as.character(mean_bins)]) / bin_sd[as.character(mean_bins)] - names(feature_dispersion_norm) <- names(feature_dispersion) - feature_dispersion_norm <- sort(feature_dispersion_norm) # sorting automatically removes NA values - if (length(feature_dispersion_norm) < num_feats) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all non-zero features", length(feature_dispersion_norm), num_feats)) - 
variable_features_ <- feature_dispersion_norm[max(1, (length(feature_dispersion_norm) - num_feats + 1)):length(feature_dispersion_norm)] - return(mat[names(variable_features_), ]) } \ No newline at end of file diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 37c9cb9f..5208c78d 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -7,8 +7,8 @@ IterativeLSI( mat, n_iterations = 2, - first_feature_selection_method = select_features_by_binned_dispersion, - feature_selection_method = select_features_by_dispersion, + first_feature_selection_method = select_features_binned_dispersion, + feature_selection_method = select_features_dispersion, lsi_method = LSI, cluster_method = cluster_graph_leiden, verbose = FALSE, @@ -20,16 +20,18 @@ IterativeLSI( \item{n_iterations}{(int) The number of LSI iterations to perform.} -\item{first_feature_selection_method}{(function) Method to use for selecting features for the first iteration. Current builtin options are \code{select_features_by_variance}, \code{select_features_by_dispersion}, \code{select_features_by_mean}, \code{select_features_by_binned_dispersion}} +\item{first_feature_selection_method}{(function) Method to use for selecting features for the first iteration. Current builtin options are \code{select_features_by_variance}, \code{select_features_by_dispersion}, \code{select_features_by_mean}, \code{select_features_binned_dispersion}} -\item{feature_selection_method}{(function) Method to use for selecting features for each iteration after the first. Current builtin options are \code{select_features_by_variance}, \code{select_features_by_dispersion}, \code{select_features_by_mean}, \code{select_features_by_binned_dispersion}} +\item{feature_selection_method}{(function) Method to use for selecting features for each iteration after the first. 
Current builtin options are \code{select_features_by_variance}, \code{select_features_by_dispersion}, \code{select_features_by_mean}, \code{select_features_binned_dispersion}} + +\item{lsi_method}{(function) Method to use for LSI. Only \code{LSI} is allowed. The user can pass in partial parameters to \code{LSI} to customize the LSI method, such as by passing \code{LSI(n_dimensions = 30, corr_cutoff = 0.5)}.} \item{cluster_method}{(function) Method to use for clustering. Current builtin options are \verb{cluster_graph_\{leiden, louvain, seurat\}()}} \item{threads}{(integer) Number of threads to use.} } \value{ -An object of class \code{c("LSI", "DimReduction")} with the following attributes: +An object of class \code{c("IterativeLSI", "DimReduction")} with the following attributes: \itemize{ \item \code{cell_embeddings}: The projected data \item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: @@ -42,7 +44,7 @@ An object of class \code{c("LSI", "DimReduction")} with the following attributes \itemize{ \item \code{iteration}: The iteration number \item \code{feature_names}: The names of the features used for the iteration -\item \code{lsi_results}: The results of LSI for the iteration +\item \code{lsi_results}: The results of LSI for the iteration. This follows the same structure as the \code{fitted_params} attribute of the \code{LSI} object, but information such as the \code{v} and \code{d} matrices are removed. \item \code{clusters}: The clusters for the iteration. 
This is blank for the first iteration } } @@ -70,5 +72,5 @@ The iterative LSI method is as follows: } } \seealso{ -\code{LSI()}, \code{top_features()}, \code{highly_variable_features()} +\code{LSI()}, \code{feature_selection}, \code{DimReduction()} } diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index dff5c88e..2b544b8d 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -8,7 +8,7 @@ LSI( mat, n_dimensions = 50L, corr_cutoff = 1, - normalize = normalize_tfidf, + scale_factor = 10000, threads = 1L, verbose = FALSE ) @@ -21,7 +21,7 @@ LSI( \item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to the corr_cutoff, it will be excluded from the final PCA matrix.} -\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} +\item{scale_factor}{(numeric) Scale factor to use for tf-idf normalization.} \item{threads}{(integer) Number of threads to use.} } @@ -31,7 +31,7 @@ An object of class \code{c("LSI", "DimReduction")} with the following attributes \item \code{cell_embeddings}: The projected data \item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. 
Columns include the following: \itemize{ -\item \code{normalization_method}: The normalization method used +\item \code{scale_factor}: The scale factor used for tf-idf normalization \item \code{feature_means}: The means of the features used for normalization \item \code{pcs_to_keep}: The PCs that were kept after filtering by correlation to sequencing depth \item \code{svd_params}: The matrix calculated for SVD diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 7a5a3d13..d3d9a0c1 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -132,7 +132,6 @@ reference: - checksum - apply_by_row - regress_out - - LSI - IterableMatrix-methods - pseudobulk_matrix @@ -152,6 +151,8 @@ reference: - contents: - DimReduction - project + - LSI + - IterativeLSI - title: "Plots" diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index fed61182..3151624f 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -221,8 +221,6 @@ test_that("LSI works", { # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to ArchR lsi_res_obj <- LSI(mat, n_dimensions = 5) lsi_res_t_obj <- LSI(t(mat), n_dimensions = 5) - # Also check partial args - lsi_res_obj_partial <- LSI(n_dimensions = 5, normalize = normalize_tfidf(scale_factor = 10000, threads = 4), threads = 4)(mat) lsi_res <- lsi_res_obj$cell_embeddings lsi_res_t <- lsi_res_t_obj$cell_embeddings # Check that projection results in the same output if used on the same input matrix @@ -232,17 +230,15 @@ test_that("LSI works", { expect_equal(ncol(lsi_res), ncol(mat)) expect_equal(nrow(lsi_res_t), 5) expect_equal(ncol(lsi_res_t), nrow(mat)) - expect_equal(lsi_res_obj$cell_embeddings, lsi_res_obj_partial$cell_embeddings) expect_equal(lsi_res, lsi_res_proj) }) - test_that("Feature selection by bin variance works", { mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to Seurat - res_table <- select_features_by_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) - res_table_t <- select_features_by_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) + res_table <- select_features_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) + res_table_t <- select_features_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) res <- mat[res_feats,] res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) @@ -254,24 +250,15 @@ test_that("Feature selection by bin variance works", { expect_equal(ncol(res_t), 500) }) -test_that("LSI works", { - mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix") - rownames(mat) <- paste0("feat", seq_len(nrow(mat))) - colnames(mat) <- paste0("cell", seq_len(ncol(mat))) - # Test only that outputs are 
reasonable. There is a full comparison in `tests/real_data/` that compares implementation to ArchR - lsi_res_obj <- LSI(mat, n_dimensions = 5) - lsi_res_t_obj <- LSI(t(mat), n_dimensions = 5) - # Also check partial args - lsi_res_obj_partial <- LSI(n_dimensions = 5)(mat) - lsi_res <- lsi_res_obj$cell_embeddings - lsi_res_t <- lsi_res_t_obj$cell_embeddings - # Check that projection results in the same output if used on the same input matrix - lsi_res_proj <- project(lsi_res_obj, mat) - expect_equal(nrow(lsi_res), 5) - expect_equal(ncol(lsi_res), ncol(mat)) - expect_equal(nrow(lsi_res_t), 5) - expect_equal(ncol(lsi_res_t), nrow(mat)) - expect_equal(lsi_res_obj, lsi_res_obj_partial) - expect_equal(lsi_res, lsi_res_proj) -}) +test_that("Iterative LSI works", { + mat <- matrix(data = runif(50000, 0, 1), nrow=500, ncol = 100) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + lsi_res_obj <- IterativeLSI(mat, n_iterations = 2, lsi_method = LSI(n_dimensions = 10)) + lsi_res_proj <- project(lsi_res_obj, mat) + lsi_res_embedding <- lsi_res_obj$cell_embeddings + expect_equal(ncol(lsi_res_embedding), ncol(mat)) + expect_equal(nrow(lsi_res_embedding), 10) + expect_equal(lsi_res_embedding, lsi_res_proj) +}) \ No newline at end of file From 5289f3905a31d52e97982a1651d6b777a8246f2d Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Sun, 26 Jan 2025 16:09:59 -0800 Subject: [PATCH 054/142] [r] add rcpphnsw to imports --- r/DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 17faadb9..7843c25d 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -43,7 +43,8 @@ Imports: scattermore, ggrepel, RColorBrewer, - hexbin + hexbin, + RcppHNSW Suggests: IRanges, GenomicRanges, From 19e96d3b3a1bd61683c61facd71e5453a22957e2 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Sun, 26 Jan 2025 16:29:48 -0800 Subject: [PATCH 055/142] [r] fix 
num_feats logic in feature selection --- r/R/singlecell_utils.R | 45 +++++++++++++++++++++----------------- r/man/feature_selection.Rd | 34 ++++++++-------------------- 2 files changed, 34 insertions(+), 45 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index f038eca5..0f91898c 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -37,7 +37,8 @@ select_features_variance <- function( mat, num_feats = 0.05, normalize = NULL, - threads = 1L + threads = 1L, + verbose = FALSE ) { assert_greater_than_zero(num_feats) assert_len(num_feats, 1) @@ -52,8 +53,12 @@ select_features_variance <- function( )) } assert_is(mat, "IterableMatrix") - num_feats <- min(max(num_feats, 0), nrow(mat)) if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) + if (min(max(num_feats, 0), nrow(mat)) != num_feats) { + if (verbose) log_progress(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat))) + num_feats <- min(max(num_feats, 0), nrow(mat)) + } + num_feats <- min(max(num_feats, 0), nrow(mat)) if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) features_df <- tibble::tibble( names = rownames(mat), @@ -71,7 +76,8 @@ select_features_variance <- function( select_features_dispersion <- function( mat, num_feats = 0.05, normalize = NULL, - threads = 1L + threads = 1L, + verbose = FALSE ) { assert_greater_than_zero(num_feats) assert_len(num_feats, 1) @@ -85,8 +91,11 @@ select_features_dispersion <- function( ) )) } - num_feats <- min(max(num_feats, 0), nrow(mat)) if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) + if (min(max(num_feats, 0), nrow(mat)) != num_feats) { + if (verbose) log_progress(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat))) + num_feats <- min(max(num_feats, 0), nrow(mat)) + } assert_is(mat, "IterableMatrix") if 
(!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) @@ -103,7 +112,7 @@ select_features_dispersion <- function( #' @returns #' - `select_features_by_mean`: \eqn{\mathrm{Score}(x_i) = \bar{x}_i} #' @export -select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L) { +select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L, verbose = FALSE) { assert_greater_than_zero(num_feats) assert_is(num_feats, "numeric") if (rlang::is_missing(mat)) { @@ -116,12 +125,13 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread )) } assert_is(mat, "IterableMatrix") - num_feats <- min(max(num_feats, 0), nrow(mat)) if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) + if (min(max(num_feats, 0), nrow(mat)) != num_feats) { + if (verbose) log_progress(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat))) + num_feats <- min(max(num_feats, 0), nrow(mat)) + } if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) # get the sum of each feature, binarized - # get the top features - features_df <- tibble::tibble( names = rownames(mat), score = matrix_stats(mat, row_stats = "nonzero", threads = threads)$row_stats["nonzero", ] @@ -132,10 +142,9 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread #' @rdname feature_selection #' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, -#' and if the number of features -#' within a bin is less than 2, the dispersion is set to 1. +#' and if the number of features within a bin is less than 2, the dispersion is set to 1. #' @returns -#' - `select_features_binned_dispersion`: Score representing the bin normalized dispersion of each feature. 
+#' - `select_features_binned_dispersion`: Process described in `details`. #' @details #' `select_features_binned_dispersion` calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015): #' 1. Calculate the dispersion of each feature (variance / mean) @@ -144,7 +153,7 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread #' @export select_features_binned_dispersion <- function( mat, num_feats = 25000, n_bins = 20, - threads = 1L + threads = 1L, verbose = FALSE ) { assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) @@ -152,14 +161,10 @@ select_features_binned_dispersion <- function( assert_is_wholenumber(n_bins) assert_len(n_bins, 1) assert_greater_than_zero(n_bins) - if (nrow(mat) <= num_feats) { - log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) - features_df <- tibble::tibble( - names = rownames(mat), - score = rep(0, nrow(mat)), - highly_variable = rep(TRUE, nrow(mat)) - ) - return(features_df) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) + if (min(max(num_feats, 0), nrow(mat)) != num_feats) { + if (verbose) log_progress(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat))) + num_feats <- min(max(num_feats, 0), nrow(mat)) } # Calculate row information for dispersion mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index c391c795..219bf91b 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -33,7 +33,8 @@ the number of rows, rounded down. Otherwise, treat as an absolute number. If the number is higher than the number of features in the matrix, all features will be returned.} -\item{normalize}{(function) Normalize matrix using a given function. 
If \code{NULL}, no normalization is performed.} +\item{normalize}{(function) Normalize matrix using a given function. Normalization occurs on the input mat prior to feature +selection. If \code{NULL}, no normalization is performed. @seealso \code{normalize_tfidf()} \code{normalize_log()}} \item{threads}{(integer) Number of threads to use.} @@ -42,24 +43,26 @@ and if the number of features within a bin is less than 2, the dispersion is set to 1.} } \value{ -Return a dataframe with the following columns, sorted descending by score: +Return a dataframe with the following columns: \itemize{ \item \code{names}: Feature name. \item \code{score}: Scoring of the feature, depending on the method used. \item \code{highly_variable}: Logical vector of whether the feature is highly variable. } -Each different feature selection method will have a different scoring method: +Each different feature selection method will have a different scoring method. +For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, determine the score of +each feature \eqn{x_i} as follows: \itemize{ -\item \code{select_features_by_variance}: Score representing variance of each feature. +\item \code{select_features_by_variance}: \eqn{\mathrm{Score}(x_i) = \frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2} } \itemize{ -\item \code{select_features_by_dispersion}: Score representing the dispersion of each feature. +\item \code{select_features_by_dispersion}: \eqn{\mathrm{Score}(x_i) = \frac{\frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2}{\bar{x}_i}} } \itemize{ -\item \code{select_features_by_mean}: Score representing the mean accessibility of each feature. +\item \code{select_features_by_mean}: \eqn{\mathrm{Score}(x_i) = \bar{x}_i} } \itemize{ @@ -70,25 +73,6 @@ Each different feature selection method will have a different scoring method: Apply a feature selection method to a \verb{(features x cells)} matrix. 
} \details{ -\code{select_features_by_variance} Calculates the variance of each feature using the following process: -\enumerate{ -\item Perform an optional term frequency + log normalization, for each feature. -\item Find \code{num_feats} features with the highest variance. -} - -\code{select_features_by_dispersion} calculates the dispersion of each feature using the following process: -\enumerate{ -\item Perform an optional term frequency + log normalization, for each feature. -\item Find the dispersion (variance/mean) of each feature. -\item Find \code{num_feats} features with the highest dispersion. -} - -\code{select_features_by_mean} calculates the mean accessibility of each feature using the following process: -\enumerate{ -\item Get the sum of each binarized feature. -\item Find \code{num_feats} features with the highest accessibility. -} - \code{select_features_binned_dispersion} calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 
2015): \enumerate{ \item Calculate the dispersion of each feature (variance / mean) From eefb33d4c866ed2b376feb14a2ab7496a34ad1b8 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Sun, 26 Jan 2025 16:30:11 -0800 Subject: [PATCH 056/142] [r] add blurb about partials in normalize --- r/R/transforms.R | 5 ++++- r/man/normalize.Rd | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index 3ce8ac56..0004126c 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -940,7 +940,10 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: #' #' - `normalize_log`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} -#' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` +#' @details +#' If the `mat` argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. +#' This can be used to customize `normalize` parameters in other single cell functions in BPCells (e.g. `select_features_mean()`) +#' - `normalize_log`: Corresponds to `Seurat::NormalizeLog` #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_is_numeric(scale_factor) diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd index ac53f2c0..5821a2c8 100644 --- a/r/man/normalize.Rd +++ b/r/man/normalize.Rd @@ -35,6 +35,8 @@ transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: Apply standard normalizations to a \verb{(features x cells)} counts matrix. } \details{ +If the \code{mat} argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. +This can be used to customize \code{normalize} parameters in other single cell functions in BPCells (e.g. 
\code{select_features_mean()}) \itemize{ \item \code{normalize_log}: Corresponds to \code{Seurat::NormalizeLog} } From e010069f2f1c14cfae7098b832e18726b15a6d35 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Sun, 26 Jan 2025 16:33:34 -0800 Subject: [PATCH 057/142] [r] update NEWS --- r/NEWS.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index deefee46..30b316eb 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -40,10 +40,9 @@ of the new features this release and will continue to help with maintenance and - Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices. If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) -- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) -- Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169) -- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #181). -- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #181). 
+- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #189) +- Add feature selection functions `select_features_by_{variance,dispersion,mean,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189) +- Add `LSI()` and `IterativeLSI()` dimensionality functions to perform latent semantic indexing on a matrix (pull request #189). ## Improvements - `trackplot_loop()` now accepts discrete color scales From 12650882893d5c25f305b75d9b9d3e1b92c73299 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Sun, 26 Jan 2025 16:33:44 -0800 Subject: [PATCH 058/142] [r] update feature selection docs --- r/man/feature_selection.Rd | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index 219bf91b..02dd31b8 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -7,22 +7,36 @@ \alias{select_features_binned_dispersion} \title{Feature selection functions} \usage{ -select_features_variance(mat, num_feats = 0.05, normalize = NULL, threads = 1L) +select_features_variance( + mat, + num_feats = 0.05, + normalize = NULL, + threads = 1L, + verbose = FALSE +) select_features_dispersion( mat, num_feats = 0.05, normalize = NULL, - threads = 1L + threads = 1L, + verbose = FALSE ) -select_features_mean(mat, num_feats = 0.05, normalize = NULL, threads = 1L) +select_features_mean( + mat, + num_feats = 0.05, + normalize = NULL, + threads = 1L, + verbose = FALSE +) select_features_binned_dispersion( mat, num_feats = 25000, n_bins = 20, - threads = 1L + threads = 1L, + verbose = FALSE ) } \arguments{ @@ -39,8 +53,7 @@ selection. If \code{NULL}, no normalization is performed. @seealso \code{normali \item{threads}{(integer) Number of threads to use.} \item{n_bins}{(integer) Number of bins for binning mean gene expression. 
Normalizing dispersion is done with respect to each bin, -and if the number of features -within a bin is less than 2, the dispersion is set to 1.} +and if the number of features within a bin is less than 2, the dispersion is set to 1.} } \value{ Return a dataframe with the following columns: @@ -66,7 +79,7 @@ each feature \eqn{x_i} as follows: } \itemize{ -\item \code{select_features_binned_dispersion}: Score representing the bin normalized dispersion of each feature. +\item \code{select_features_binned_dispersion}: Process described in \code{details}. } } \description{ From dd03ecc875ad8e72046363f2ab3224424ebb94da Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 3 Feb 2025 12:01:16 -0800 Subject: [PATCH 059/142] [r] update partial code styling in feature selection, normalization --- r/R/singlecell_utils.R | 74 ++++++++++++++++++++---------------------- r/R/transforms.R | 25 +++++--------- 2 files changed, 44 insertions(+), 55 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 0f91898c..521c4de0 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -44,13 +44,11 @@ select_features_variance <- function( assert_len(num_feats, 1) assert_is(num_feats, "numeric") if (rlang::is_missing(mat)) { - return(create_partial( - missing_args = list( - num_feats = missing(num_feats), - normalize = missing(normalize), - threads = missing(threads) - ) - )) + return(create_partial(missing_args = list( + num_feats = missing(num_feats), + normalize = missing(normalize), + threads = missing(threads) + ))) } assert_is(mat, "IterableMatrix") if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) @@ -83,13 +81,11 @@ select_features_dispersion <- function( assert_len(num_feats, 1) assert_is(num_feats, "numeric") if (rlang::is_missing(mat)) { - return(create_partial( - missing_args = list( - num_feats = missing(num_feats), - normalize = missing(normalize), - threads = missing(threads) - ) - )) + 
return(create_partial(missing_args = list( + num_feats = missing(num_feats), + normalize = missing(normalize), + threads = missing(threads) + ))) } if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (min(max(num_feats, 0), nrow(mat)) != num_feats) { @@ -116,13 +112,11 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread assert_greater_than_zero(num_feats) assert_is(num_feats, "numeric") if (rlang::is_missing(mat)) { - return(create_partial( - missing_args = list( - num_feats = missing(num_feats), - normalize = missing(normalize), - threads = missing(threads) - ) - )) + return(create_partial(missing_args = list( + num_feats = missing(num_feats), + normalize = missing(normalize), + threads = missing(threads) + ))) } assert_is(mat, "IterableMatrix") if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) @@ -155,12 +149,21 @@ select_features_binned_dispersion <- function( mat, num_feats = 25000, n_bins = 20, threads = 1L, verbose = FALSE ) { - assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) assert_len(num_feats, 1) assert_is_wholenumber(n_bins) assert_len(n_bins, 1) assert_greater_than_zero(n_bins) + if (rlang::is_missing(mat)) { + return(create_partial(missing_args = list( + num_feats = missing(num_feats), + n_bins = missing(n_bins), + threads = missing(threads), + verbose = missing(verbose) + ))) + } + assert_is(mat, "IterableMatrix") if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (min(max(num_feats, 0), nrow(mat)) != num_feats) { if (verbose) log_progress(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat))) @@ -212,11 +215,10 @@ select_features_binned_dispersion <- function( DimReduction <- function(x, fitted_params = list(), ...) 
{ assert_is(x, c("IterableMatrix", "dgCMatrix", "matrix")) assert_is(fitted_params, "list") - structure( - list( - cell_embeddings = x, - fitted_params = fitted_params, - ... + structure(list( + cell_embeddings = x, + fitted_params = fitted_params, + ... ), class = "DimReduction" ) @@ -276,16 +278,12 @@ LSI <- function( threads = 1L, verbose = FALSE ) { if (rlang::is_missing(mat)) { - return( - create_partial( - missing_args = list( - n_dimensions = missing(n_dimensions), - corr_cutoff = missing(corr_cutoff), - threads = missing(threads), - verbose = missing(verbose) - ) - ) - ) + return(create_partial(missing_args = list( + n_dimensions = missing(n_dimensions), + corr_cutoff = missing(corr_cutoff), + threads = missing(threads), + verbose = missing(verbose) + ))) } assert_is(mat, "IterableMatrix") assert_is_wholenumber(n_dimensions) @@ -466,7 +464,7 @@ IterativeLSI <- function( fitted_params$iter_info$clusters[[i]] <- clustering_res # pseudobulk and pass onto next iteration if (verbose) log_progress("Pseudobulking matrix") - pseudobulk_res <- pseudobulk_matrix(mat, clustering_res, threads = as.integer(threads)) %>% as("dgCMatrix") %>% as("IterableMatrix") + pseudobulk_res <- pseudobulk_matrix(mat, clustering_res, threads = as.integer(threads)) # Only take the SVD information required to project the matrix fitted_params$iter_info$lsi_results[[i]]$svd_params <- list( u = lsi_res_obj$fitted_params$svd_params$u diff --git a/r/R/transforms.R b/r/R/transforms.R index 0004126c..b9db2be3 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -949,14 +949,10 @@ normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_is_numeric(scale_factor) assert_greater_than_zero(scale_factor) if (rlang::is_missing(mat)) { - return( - create_partial( - missing_args = list( - scale_factor = missing(scale_factor), - threads = missing(threads) - ) - ) - ) + return(create_partial(missing_args = list( + scale_factor = missing(scale_factor), + threads = missing(threads) + ))) 
} assert_is(mat, "IterableMatrix") read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) @@ -978,15 +974,10 @@ normalize_tfidf <- function( ) { assert_is_wholenumber(threads) if (rlang::is_missing(mat)) { - return( - create_partial( - missing_args = list( - feature_means = missing(feature_means), - scale_factor = missing(scale_factor), - threads = missing(threads) - ) - ) - ) + return(create_partial(missing_args = list( + feature_means = missing(feature_means), + threads = missing(threads) + ))) } assert_is(mat, "IterableMatrix") # If feature means are passed in, only need to calculate term frequency From 83d8877bcf18934a07689caf3e8bf46c14684bea Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 3 Feb 2025 14:51:42 -0800 Subject: [PATCH 060/142] [r] fix matrix flexibility in normalization, feature selection, lsi --- r/R/singlecell_utils.R | 4 ++++ r/R/transforms.R | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 521c4de0..669b4946 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -50,6 +50,7 @@ select_features_variance <- function( threads = missing(threads) ))) } + if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") assert_is(mat, "IterableMatrix") if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (min(max(num_feats, 0), nrow(mat)) != num_feats) { @@ -92,6 +93,7 @@ select_features_dispersion <- function( if (verbose) log_progress(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat))) num_feats <- min(max(num_feats, 0), nrow(mat)) } + if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") assert_is(mat, "IterableMatrix") if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) mat_stats <- 
matrix_stats(mat, row_stats = "variance", threads = threads) @@ -118,6 +120,7 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread threads = missing(threads) ))) } + if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") assert_is(mat, "IterableMatrix") if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (min(max(num_feats, 0), nrow(mat)) != num_feats) { @@ -163,6 +166,7 @@ select_features_binned_dispersion <- function( verbose = missing(verbose) ))) } + if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") assert_is(mat, "IterableMatrix") if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (min(max(num_feats, 0), nrow(mat)) != num_feats) { diff --git a/r/R/transforms.R b/r/R/transforms.R index b9db2be3..25f9c225 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -952,8 +952,9 @@ normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { return(create_partial(missing_args = list( scale_factor = missing(scale_factor), threads = missing(threads) - ))) + ))) } + if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") assert_is(mat, "IterableMatrix") read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) mat <- mat %>% multiply_cols(1 / read_depth) @@ -979,6 +980,7 @@ normalize_tfidf <- function( threads = missing(threads) ))) } + if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") assert_is(mat, "IterableMatrix") # If feature means are passed in, only need to calculate term frequency if (is.null(feature_means)) { From 1093cf5fdff0556b33846b5c6e552e1d4d7e4bb2 Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Wed, 5 Feb 2025 21:09:25 -0800 Subject: [PATCH 061/142] docs updates --- r/R/singlecell_utils.R | 31 +++++++++++++++++++++++++++---- 
r/R/transforms.R | 10 +++++++--- r/man/feature_selection.Rd | 32 ++++++++++++++++++++++++++++---- r/man/normalize.Rd | 10 +++++++--- 4 files changed, 69 insertions(+), 14 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 669b4946..93f9fc9d 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -33,6 +33,26 @@ #' For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, determine the score of #' each feature \eqn{x_i} as follows: #' - `select_features_by_variance`: \eqn{\mathrm{Score}(x_i) = \frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2} +#' @examples +#' set.seed(12345) +#' mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +#' rownames(mat) <- paste0("gene", seq_len(nrow(mat))) +#' mat +#' +#' select_features_variance( +#' mat, +#' num_feats=2, +#' normalize=normalize_log +#' ) +#' +#' # Because of how the BPCells normalize functions behave when the matrix +#' # argument is missing, we can also customize the normalization parameters: +#' select_features_variance( +#' mat, +#' num_feats=2, +#' normalize=normalize_log(scale_factor=20) +#' ) +#' #' @export select_features_variance <- function( mat, num_feats = 0.05, @@ -143,10 +163,13 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread #' @returns #' - `select_features_binned_dispersion`: Process described in `details`. #' @details -#' `select_features_binned_dispersion` calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015): -#' 1. Calculate the dispersion of each feature (variance / mean) -#' 2. Log normalize dispersion and mean -#' 3. Bin the features by their means, and normalize dispersion within each bin +#' `select_features_binned_dispersion` implements the approach from Satija et al. 2015: +#' 1. Bin features into equal-width bins by `log1p(mean)` +#' 2. 
Calculate dispersion of each feature as `log(variance / mean)` +#' 3. Z-score normalize dispersion within each bin, and select highest normalized dispersion across all bins +#' +#' This should be equivalent to `Seurat::FindVariableFeatures()` with `selection.method="mean.var.plot"` +#' and `scanpy.pp.highly_variable_genes()` with `flavor="seurat"`. #' @export select_features_binned_dispersion <- function( mat, num_feats = 25000, n_bins = 20, diff --git a/r/R/transforms.R b/r/R/transforms.R index 25f9c225..63e23cbb 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -941,9 +941,13 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' #' - `normalize_log`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} #' @details +#' **Passing to `normalize` parameters with non-default arguments** +#' #' If the `mat` argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. -#' This can be used to customize `normalize` parameters in other single cell functions in BPCells (e.g. `select_features_mean()`) -#' - `normalize_log`: Corresponds to `Seurat::NormalizeLog` +#' This can be used to customize `normalize` parameters in other single cell functions in BPCells (e.g. `select_features_mean()`). +#' +#' **Related functions from other packages** +#' - `normalize_log`: Corresponds to `Seurat::NormalizeData()` with its default "LogNormalize" method. #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_is_numeric(scale_factor) @@ -967,7 +971,7 @@ normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { #' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. #' Else, map each feature name to its mean value. 
#' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} -#' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 2019, used by default in `ArchR::addIterativeLSI()` and `Signac::RunTFIDF()` +#' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 2019, matching the default behavior of `Signac::RunTFIDF()`. This also matches the normalization used within `ArchR::addIterativeLSI()`, but with `binarize = FALSE`. #' @export normalize_tfidf <- function( mat, feature_means = NULL, diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index 02dd31b8..c17319e6 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -86,10 +86,34 @@ each feature \eqn{x_i} as follows: Apply a feature selection method to a \verb{(features x cells)} matrix. } \details{ -\code{select_features_binned_dispersion} calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015): +\code{select_features_binned_dispersion} implements the approach from Satija et al. 2015: \enumerate{ -\item Calculate the dispersion of each feature (variance / mean) -\item Log normalize dispersion and mean -\item Bin the features by their means, and normalize dispersion within each bin +\item Bin features into equal-width bins by \code{log1p(mean)} +\item Calculate dispersion of each feature as \code{log(variance / mean)} +\item Z-score normalize dispersion within each bin, and select highest normalized dispersion across all bins } + +This should be equivalent to \code{Seurat::FindVariableFeatures()} with \code{selection.method="mean.var.plot"} +and \code{scanpy.pp.highly_variable_genes()} with \code{flavor="seurat"}. 
+} +\examples{ +set.seed(12345) +mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +rownames(mat) <- paste0("gene", seq_len(nrow(mat))) +mat + +select_features_variance( + mat, + num_feats=2, + normalize=normalize_log +) + +# Because of how the BPCells normalize functions behave when the matrix +# argument is missing, we can also customize the normalization parameters: +select_features_variance( + mat, + num_feats=2, + normalize=normalize_log(scale_factor=20) +) + } diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd index 5821a2c8..f6467dca 100644 --- a/r/man/normalize.Rd +++ b/r/man/normalize.Rd @@ -35,13 +35,17 @@ transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: Apply standard normalizations to a \verb{(features x cells)} counts matrix. } \details{ +\strong{Passing to \code{normalize} parameters with non-default arguments} + If the \code{mat} argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. -This can be used to customize \code{normalize} parameters in other single cell functions in BPCells (e.g. \code{select_features_mean()}) +This can be used to customize \code{normalize} parameters in other single cell functions in BPCells (e.g. \code{select_features_mean()}). + +\strong{Related functions from other packages} \itemize{ -\item \code{normalize_log}: Corresponds to \code{Seurat::NormalizeLog} +\item \code{normalize_log}: Corresponds to \code{Seurat::NormalizeData()} with its default "LogNormalize" method. } \itemize{ -\item \code{normalize_tfidf}: This follows the formula from Stuart, Butler et al. 2019, used by default in \code{ArchR::addIterativeLSI()} and \code{Signac::RunTFIDF()} +\item \code{normalize_tfidf}: This follows the formula from Stuart, Butler et al. 2019, matching the default behavior of \code{Signac::RunTFIDF()}. This also matches the normalization used within \code{ArchR::addIterativeLSI()}, but with \code{binarize = FALSE}. 
} } From 82c2ec765b240763d72dff5638b597a4ea4d6557 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 6 Feb 2025 16:01:31 -0800 Subject: [PATCH 062/142] [r] add in timestamp logging, fix partial creation --- r/R/singlecell_utils.R | 102 ++++++++--------------- r/R/transforms.R | 17 +--- r/R/utils.R | 22 ++++- r/tests/testthat/test-singlecell_utils.R | 20 +---- 4 files changed, 59 insertions(+), 102 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 93f9fc9d..c599403f 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -32,7 +32,7 @@ #' Each different feature selection method will have a different scoring method. #' For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, determine the score of #' each feature \eqn{x_i} as follows: -#' - `select_features_by_variance`: \eqn{\mathrm{Score}(x_i) = \frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2} +#' - `select_features_variance`: \eqn{\mathrm{Score}(x_i) = \frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2} #' @examples #' set.seed(12345) #' mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) @@ -63,22 +63,15 @@ select_features_variance <- function( assert_greater_than_zero(num_feats) assert_len(num_feats, 1) assert_is(num_feats, "numeric") - if (rlang::is_missing(mat)) { - return(create_partial(missing_args = list( - num_feats = missing(num_feats), - normalize = missing(normalize), - threads = missing(threads) - ))) - } + if (rlang::is_missing(mat)) return(create_partial()) if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") assert_is(mat, "IterableMatrix") if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (min(max(num_feats, 0), nrow(mat)) != num_feats) { - if (verbose) log_progress(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat))) - num_feats <- 
min(max(num_feats, 0), nrow(mat)) + rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) } num_feats <- min(max(num_feats, 0), nrow(mat)) - if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads, verbose = verbose)(mat) features_df <- tibble::tibble( names = rownames(mat), score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",] @@ -90,7 +83,7 @@ select_features_variance <- function( #' @rdname feature_selection #' @returns -#' - `select_features_by_dispersion`: \eqn{\mathrm{Score}(x_i) = \frac{\frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2}{\bar{x}_i}} +#' - `select_features_dispersion`: \eqn{\mathrm{Score}(x_i) = \frac{\frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2}{\bar{x}_i}} #' @export select_features_dispersion <- function( mat, num_feats = 0.05, @@ -101,21 +94,15 @@ select_features_dispersion <- function( assert_greater_than_zero(num_feats) assert_len(num_feats, 1) assert_is(num_feats, "numeric") - if (rlang::is_missing(mat)) { - return(create_partial(missing_args = list( - num_feats = missing(num_feats), - normalize = missing(normalize), - threads = missing(threads) - ))) - } + if (rlang::is_missing(mat)) return(create_partial()) if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (min(max(num_feats, 0), nrow(mat)) != num_feats) { - if (verbose) log_progress(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat))) - num_feats <- min(max(num_feats, 0), nrow(mat)) + rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) } + num_feats <- min(max(num_feats, 0), nrow(mat)) if (!is(mat, 
"IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") assert_is(mat, "IterableMatrix") - if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads, verbose = verbose)(mat) mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) features_df <- tibble::tibble( names = rownames(mat), @@ -128,26 +115,20 @@ select_features_dispersion <- function( #' @rdname feature_selection #' @returns -#' - `select_features_by_mean`: \eqn{\mathrm{Score}(x_i) = \bar{x}_i} +#' - `select_features_mean`: \eqn{\mathrm{Score}(x_i) = \bar{x}_i} #' @export select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L, verbose = FALSE) { assert_greater_than_zero(num_feats) assert_is(num_feats, "numeric") - if (rlang::is_missing(mat)) { - return(create_partial(missing_args = list( - num_feats = missing(num_feats), - normalize = missing(normalize), - threads = missing(threads) - ))) - } + if (rlang::is_missing(mat)) return(create_partial()) if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") assert_is(mat, "IterableMatrix") if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (min(max(num_feats, 0), nrow(mat)) != num_feats) { - if (verbose) log_progress(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat))) - num_feats <- min(max(num_feats, 0), nrow(mat)) + rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) } - if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) + num_feats <- min(max(num_feats, 0), nrow(mat)) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads, verbose = verbose)(mat) # get the sum of each feature, 
binarized features_df <- tibble::tibble( names = rownames(mat), @@ -172,30 +153,22 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread #' and `scanpy.pp.highly_variable_genes()` with `flavor="seurat"`. #' @export select_features_binned_dispersion <- function( - mat, num_feats = 25000, n_bins = 20, + mat, num_feats = 0.05, n_bins = 20, threads = 1L, verbose = FALSE ) { - assert_greater_than_zero(num_feats) assert_len(num_feats, 1) assert_is_wholenumber(n_bins) assert_len(n_bins, 1) assert_greater_than_zero(n_bins) - if (rlang::is_missing(mat)) { - return(create_partial(missing_args = list( - num_feats = missing(num_feats), - n_bins = missing(n_bins), - threads = missing(threads), - verbose = missing(verbose) - ))) - } + if (rlang::is_missing(mat)) return(create_partial()) if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") assert_is(mat, "IterableMatrix") if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (min(max(num_feats, 0), nrow(mat)) != num_feats) { - if (verbose) log_progress(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat))) - num_feats <- min(max(num_feats, 0), nrow(mat)) + rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) } + num_feats <- min(max(num_feats, 0), nrow(mat)) # Calculate row information for dispersion mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) feature_means <- mat_stats$row_stats["mean", ] @@ -208,18 +181,18 @@ select_features_binned_dispersion <- function( feature_means <- log1p(feature_means) features_df <- tibble::tibble( names = names(feature_means), - var = feature_vars, + var = feature_vars, mean = feature_means, dispersion = feature_dispersion ) # Bin by mean, and normalize dispersion with each bin - features_df <- 
features_df %>% + features_df <- features_df %>% dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% - dplyr::group_by(bin) %>% - dplyr::mutate( + dplyr::group_by(bin) %>% + dplyr::mutate( score = (dispersion - mean(dispersion)) / sd(dispersion), score = if (dplyr::n() == 1) {1} else {score} # Set feats that are in bins with only one feat to have a norm dispersion of 1 - ) %>% + ) %>% dplyr::ungroup() %>% dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) %>% dplyr::select(c("names", "score", "highly_variable")) @@ -305,12 +278,7 @@ LSI <- function( threads = 1L, verbose = FALSE ) { if (rlang::is_missing(mat)) { - return(create_partial(missing_args = list( - n_dimensions = missing(n_dimensions), - corr_cutoff = missing(corr_cutoff), - threads = missing(threads), - verbose = missing(verbose) - ))) + return(create_partial()) } assert_is(mat, "IterableMatrix") assert_is_wholenumber(n_dimensions) @@ -325,8 +293,7 @@ LSI <- function( if (verbose) log_progress("Normalizing matrix") mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads) read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) - mat <- partial_apply( - normalize_tfidf, + mat <- normalize_tfidf( feature_means = mat_stats$row_stats["mean", ], scale_factor = scale_factor, threads = threads @@ -398,8 +365,8 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' Given a `(features x cells)` matrix, Compute an iterative LSI dimensionality reduction, using the method described in [ArchR](https://doi.org/10.1038/s41588-021-00790-6) (Granja et al; 2019). #' @param mat (IterableMatrix) #' @param n_iterations (int) The number of LSI iterations to perform. -#' @param first_feature_selection_method (function) Method to use for selecting features for the first iteration. 
Current builtin options are `select_features_by_variance`, `select_features_by_dispersion`, `select_features_by_mean`, `select_features_binned_dispersion` -#' @param feature_selection_method (function) Method to use for selecting features for each iteration after the first. Current builtin options are `select_features_by_variance`, `select_features_by_dispersion`, `select_features_by_mean`, `select_features_binned_dispersion` +#' @param first_feature_selection_method (function) Method to use for selecting features for the first iteration. Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` +#' @param feature_selection_method (function) Method to use for selecting features for each iteration after the first. Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` #' @param cluster_method (function) Method to use for clustering. Current builtin options are `cluster_graph_{leiden, louvain, seurat}()` #' @param lsi_method (function) Method to use for LSI. Only `LSI` is allowed. The user can pass in partial parameters to `LSI` to customize the LSI method, such as by passing `LSI(n_dimensions = 30, corr_cutoff = 0.5)`. 
#' @return An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: @@ -437,7 +404,7 @@ IterativeLSI <- function( feature_selection_method = select_features_dispersion, lsi_method = LSI, # Make only allowed to be LSI cluster_method = cluster_graph_leiden, - verbose = FALSE, threads = 1L + threads = 1L, verbose = FALSE ) { assert_is(mat, "IterableMatrix") assert_true(n_iterations > 0) @@ -478,7 +445,11 @@ IterativeLSI <- function( } # run LSI if (verbose) log_progress("Running LSI") - lsi_res_obj <- lsi_method(mat[mat_indices,], threads = threads) + lsi_res_obj <- partial_apply( + lsi_method, + threads = threads, + verbose = verbose + )(mat[mat_indices,]) fitted_params$iter_info$lsi_results[[i]] <- lsi_res_obj$fitted_params # remove the feature means from the lsi results as they are already calculated # save minimum info for lsi results if not onn terminal iteration @@ -510,7 +481,6 @@ IterativeLSI <- function( #' @export project.IterativeLSI <- function(x, mat, threads = 1L, ...) { assert_is(mat, "IterableMatrix") - assert_is(x, "IterativeLSI") fitted_params <- x$fitted_params # Get the final row of fitted params last_iter_info <- fitted_params$iter_info[nrow(fitted_params$iter_info),] @@ -530,13 +500,12 @@ project.IterativeLSI <- function(x, mat, threads = 1L, ...) { # since we don't hold the LSI object, we copy the internal logic from `project.LSI()` lsi_attr <- attr(x$fitted_params$lsi_method, "args") - func <- partial_apply( - normalize_tfidf, + mat <- normalize_tfidf( + mat = mat, feature_means = fitted_params$feature_means, scale_factor = lsi_attr$scale_factor, threads = threads ) - mat <- func(mat) mat <- write_matrix_dir( convert_matrix_type(mat, type = "float"), tempfile("mat"), compress = TRUE @@ -614,7 +583,6 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } - #' Aggregate counts matrices by cell group or feature. 
#' diff --git a/r/R/transforms.R b/r/R/transforms.R index 63e23cbb..8b8ab7f5 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -950,14 +950,9 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' - `normalize_log`: Corresponds to `Seurat::NormalizeData()` with its default "LogNormalize" method. #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { - assert_is_numeric(scale_factor) assert_greater_than_zero(scale_factor) - if (rlang::is_missing(mat)) { - return(create_partial(missing_args = list( - scale_factor = missing(scale_factor), - threads = missing(threads) - ))) - } + assert_is_wholenumber(threads) + if (rlang::is_missing(mat)) return(create_partial()) if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") assert_is(mat, "IterableMatrix") read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) @@ -977,13 +972,9 @@ normalize_tfidf <- function( mat, feature_means = NULL, scale_factor = 1e4, threads = 1L ) { + assert_greater_than_zero(scale_factor) assert_is_wholenumber(threads) - if (rlang::is_missing(mat)) { - return(create_partial(missing_args = list( - feature_means = missing(feature_means), - threads = missing(threads) - ))) - } + if (rlang::is_missing(mat)) return(create_partial()) if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") assert_is(mat, "IterableMatrix") # If feature means are passed in, only need to calculate term frequency diff --git a/r/R/utils.R b/r/R/utils.R index d7c65358..3133ed8c 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -45,11 +45,16 @@ document_granges <- function( ), intro_noun, bullets) } +# Add current timestamp to a character string. +add_timestamp <- function(msg) { + return(paste0(format(Sys.time(), "%Y-%m-%d %H:%M:%S "), msg)) +} + # Function which prints a message using shell echo. 
# Useful for printing messages from inside mclapply when running in Rstudio. log_progress <- function(msg, add_timestamp = TRUE){ if (add_timestamp) { - msg <- paste0(format(Sys.time(), "%Y-%m-%d %H:%M:%S "), msg) + msg <- add_timestamp(msg) } if (.Platform$GUI == "RStudio") { system(sprintf('echo "%s"', paste0(msg, collapse=""))) @@ -96,9 +101,12 @@ create_partial <- function(missing_args=list()) { #' @param .overwrite (bool) If `f` is already an output from #' `partial_apply()`, whether parameter re-definitions should #' be ignored or overwrite the existing definitions +#' @param .missing_args_error (bool) If `TRUE`, passing in arguments +#' that are not in the function's signature will raise an error, otherwise +#' they will be ignored #' @return A `bpcells_partial` object (a function with some extra attributes) #' @keywords internal -partial_apply <- function(f, ..., .overwrite=FALSE) { +partial_apply <- function(f, ..., .overwrite = TRUE, .missing_args_error = FALSE) { args <- rlang::list2(...) 
if (is(f, "bpcells_partial")) { @@ -113,6 +121,14 @@ partial_apply <- function(f, ..., .overwrite=FALSE) { } else { function_name <- rlang::sym(rlang::caller_arg(f)) } + # See which arguments do not exist in f + missing_args <- which(!names(args) %in% names(formals(f))) + if (length(missing_args) > 0) { + if (.missing_args_error) { + stop(sprintf("Arguments %s are not in the function signature", paste0(names(args)[missing_args], collapse=", "))) + } else { + args <- args[-missing_args]} + } partial_fn <- do.call(purrr::partial, c(f, args)) attr(partial_fn, "body")[[1]] <- function_name structure( @@ -121,4 +137,4 @@ partial_apply <- function(f, ..., .overwrite=FALSE) { args = args, fn = f ) -} \ No newline at end of file +} diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 3151624f..27e57055 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -28,7 +28,7 @@ test_that("select_features works general case", { expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting expect_equal(sum(res$highly_variable), 5) # Only 10 features marked as highly variable expect_setequal(res$names, rownames(m1)) - res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows + res_more_feats_than_rows <- suppressWarnings(do.call(fn, list(m1, num_feats = 10000))) # more features than rows res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) res_feats_partial <- get(fn)(num_feats = 100)(m1) expect_identical(res_feats_equal_rows, res_feats_partial) @@ -233,24 +233,6 @@ test_that("LSI works", { expect_equal(lsi_res, lsi_res_proj) }) - -test_that("Feature selection by bin variance works", { - mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") - # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to Seurat - res_table <- select_features_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) - res_table_t <- select_features_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) - res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) - res <- mat[res_feats,] - res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) - res_t <- t(mat[,res_feats_t]) - - expect_equal(nrow(res), 10) - expect_equal(ncol(res), 26) - expect_equal(nrow(res_t), 10) - expect_equal(ncol(res_t), 500) -}) - - test_that("Iterative LSI works", { mat <- matrix(data = runif(50000, 0, 1), nrow=500, ncol = 100) %>% as("dgCMatrix") %>% as("IterableMatrix") rownames(mat) <- paste0("feat", seq_len(nrow(mat))) From d9328c0f60222aa289b116a6d2b24173ead94243 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 6 Feb 2025 16:02:44 -0800 Subject: [PATCH 063/142] [r] update docs --- r/NEWS.md | 2 +- r/man/IterativeLSI.Rd | 8 ++++---- r/man/feature_selection.Rd | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 8c89326a..f2d08307 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -49,7 +49,7 @@ of the new features this release and will continue to help with maintenance and If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. 
(pull request #128) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #189) -- Add feature selection functions `select_features_by_{variance,dispersion,mean,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189) +- Add feature selection functions `select_features_variance()`, `select_features_{variance,dispersion,mean,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189) - Add `LSI()` and `IterativeLSI()` dimensionality functions to perform latent semantic indexing on a matrix (pull request #189). ## Improvements diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 5208c78d..99511206 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -11,8 +11,8 @@ IterativeLSI( feature_selection_method = select_features_dispersion, lsi_method = LSI, cluster_method = cluster_graph_leiden, - verbose = FALSE, - threads = 1L + threads = 1L, + verbose = FALSE ) } \arguments{ @@ -20,9 +20,9 @@ IterativeLSI( \item{n_iterations}{(int) The number of LSI iterations to perform.} -\item{first_feature_selection_method}{(function) Method to use for selecting features for the first iteration. Current builtin options are \code{select_features_by_variance}, \code{select_features_by_dispersion}, \code{select_features_by_mean}, \code{select_features_binned_dispersion}} +\item{first_feature_selection_method}{(function) Method to use for selecting features for the first iteration. 
Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_mean}, \code{select_features_binned_dispersion}} -\item{feature_selection_method}{(function) Method to use for selecting features for each iteration after the first. Current builtin options are \code{select_features_by_variance}, \code{select_features_by_dispersion}, \code{select_features_by_mean}, \code{select_features_binned_dispersion}} +\item{feature_selection_method}{(function) Method to use for selecting features for each iteration after the first. Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_mean}, \code{select_features_binned_dispersion}} \item{lsi_method}{(function) Method to use for LSI. Only \code{LSI} is allowed. The user can pass in partial parameters to \code{LSI} to customize the LSI method, such as by passing \code{LSI(n_dimensions = 30, corr_cutoff = 0.5)}.} diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index c17319e6..83d0f27d 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -33,7 +33,7 @@ select_features_mean( select_features_binned_dispersion( mat, - num_feats = 25000, + num_feats = 0.05, n_bins = 20, threads = 1L, verbose = FALSE @@ -67,15 +67,15 @@ Each different feature selection method will have a different scoring method. 
For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, determine the score of each feature \eqn{x_i} as follows: \itemize{ -\item \code{select_features_by_variance}: \eqn{\mathrm{Score}(x_i) = \frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2} +\item \code{select_features_variance}: \eqn{\mathrm{Score}(x_i) = \frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2} } \itemize{ -\item \code{select_features_by_dispersion}: \eqn{\mathrm{Score}(x_i) = \frac{\frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2}{\bar{x}_i}} +\item \code{select_features_dispersion}: \eqn{\mathrm{Score}(x_i) = \frac{\frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2}{\bar{x}_i}} } \itemize{ -\item \code{select_features_by_mean}: \eqn{\mathrm{Score}(x_i) = \bar{x}_i} +\item \code{select_features_mean}: \eqn{\mathrm{Score}(x_i) = \bar{x}_i} } \itemize{ From c926219a6abd8deb9468a25da45a46d2713034ac Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 6 Feb 2025 17:34:07 -0800 Subject: [PATCH 064/142] [r] add in additional function for matrix coercibility --- r/R/errorChecking.R | 12 +++++++++++- r/man/partial_apply.Rd | 6 +++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/r/R/errorChecking.R b/r/R/errorChecking.R index 7691ab98..d65be9fb 100644 --- a/r/R/errorChecking.R +++ b/r/R/errorChecking.R @@ -113,7 +113,17 @@ assert_is <- function(object, class, n = 1) { if (!match) pretty_error(object, sprintf("must have class %s", paste0(class, collapse = ", or ")), n) } } - +assert_is_mat <- function(object, n = 1) { + if (length(object) == 1) { + if (!is(object, "IterableMatrix") && !canCoerce(mat, "IterableMatrix")) + pretty_error(object, "must either be an IterableMatrix or coercible to an IterableMatrix", n) + } else { + for (mat in object) { + if (!is(mat, "IterableMatrix") && !canCoerce(mat, "IterableMatrix")) + pretty_error(mat, "must either be an IterableMatrix or coercible to an 
IterableMatrix", n) + } + } +} assert_true <- function(expr, n = 1) { if (!expr) pretty_error(expr, "is not true", n) } diff --git a/r/man/partial_apply.Rd b/r/man/partial_apply.Rd index c1ea508d..abafbc0b 100644 --- a/r/man/partial_apply.Rd +++ b/r/man/partial_apply.Rd @@ -4,7 +4,7 @@ \alias{partial_apply} \title{Create partial function calls} \usage{ -partial_apply(f, ..., .overwrite = FALSE) +partial_apply(f, ..., .overwrite = TRUE, .missing_args_error = FALSE) } \arguments{ \item{f}{A function} @@ -14,6 +14,10 @@ partial_apply(f, ..., .overwrite = FALSE) \item{.overwrite}{(bool) If \code{f} is already an output from \code{partial_apply()}, whether parameter re-definitions should be ignored or overwrite the existing definitions} + +\item{.missing_args_error}{(bool) If \code{TRUE}, passing in arguments +that are not in the function's signature will raise an error, otherwise +they will be ignored} } \value{ A \code{bpcells_partial} object (a function with some extra attributes) From eaeb56ff6cb046fb00c80d1192c4c5198ce7867e Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 7 Feb 2025 12:56:54 -0800 Subject: [PATCH 065/142] [r] update NEWS --- r/NEWS.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index f2d08307..52b2e1fc 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -11,6 +11,9 @@ Contributions welcome :) ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. 
(Thanks to @ycli1995 for pull request #166) - Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168) +- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p (pull request #189) +- Add feature selection functions `select_features_{variance,dispersion,mean,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189) +- Add `LSI()` and `IterativeLSI()` dimensionality functions to perform latent semantic indexing on a matrix (pull request #189). ## Improvements - Speed up taking large subsets of large concatenated matrices, e.g. selecting 9M cells from a 10M cell matrix composed of ~100 concatenated pieces. (pull request #179) @@ -48,9 +51,6 @@ of the new features this release and will continue to help with maintenance and - Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices. If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. 
(pull request #128) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) -- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #189) -- Add feature selection functions `select_features_variance()`, `select_features_{variance,dispersion,mean,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189) -- Add `LSI()` and `IterativeLSI()` dimensionality functions to perform latent semantic indexing on a matrix (pull request #189). ## Improvements - `trackplot_loop()` now accepts discrete color scales From b6a13dd0d891375ece6107979e965125a7f5324e Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 7 Feb 2025 13:26:14 -0800 Subject: [PATCH 066/142] [r] update feature selcection/normalization with PR suggestions --- r/R/singlecell_utils.R | 26 ++++++++++++++------------ r/R/transforms.R | 8 ++++---- r/man/{lsi.Rd => LSI.Rd} | 2 +- r/man/feature_selection.Rd | 23 +++++++++++++---------- 4 files changed, 32 insertions(+), 27 deletions(-) rename r/man/{lsi.Rd => LSI.Rd} (94%) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index c599403f..e6b0c6d7 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -13,15 +13,15 @@ #' Feature selection functions #' -#' Apply a feature selection method to a `(features x cells)` matrix. +#' Apply a feature selection method to a non-normalized `(features x cells)` matrix. We recommend using counts matrices as input and +#' apply any normalizations prior to feature selection via the normalize argument (if available). The output of these functions is a dataframe that has columns that +#' at the minimum include the feature names and a score for each feature. 
#' @rdname feature_selection #' @param mat (IterableMatrix) dimensions features x cells -#' @param num_feats (float) Number of features to return. If the number given is between 0 and 1, treat as a proportion of -#' the number of rows, rounded down. Otherwise, treat as an absolute number. -#' If the number is higher than the number of features in the matrix, -#' all features will be returned. -#' @param normalize (function) Normalize matrix using a given function. Normalization occurs on the input mat prior to feature -#' selection. If `NULL`, no normalization is performed. @seealso `normalize_tfidf()` `normalize_log()` +#' @param num_feats (float) Number of features to mark as highly_variable. If 0 < num_feats < 1, then interpret it as a fraction of features. +#' @param normalize (function) Normalize the matrix prior to feature selection by calling normalize(mat) if it's not NULL. +#' For example, pass normalize_log() or normalize_tfidf(). +#' If the normalize function accepts a threads argument, that will be passed as normalize(mat, threads=threads). #' @param threads (integer) Number of threads to use. #' @returns #' Return a dataframe with the following columns: @@ -52,7 +52,7 @@ #' num_feats=2, #' normalize=normalize_log(scale_factor=20) #' ) -#' +#' @seealso `normalize_tfidf()` `normalize_log()` #' @export select_features_variance <- function( mat, num_feats = 0.05, @@ -139,8 +139,7 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread } #' @rdname feature_selection -#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, -#' and if the number of features within a bin is less than 2, the dispersion is set to 1. +#' @param n_bins (integer) Number of bins to split features into in order to control for the relationship between mean expression and dispersion (see details). #' @returns #' - `select_features_binned_dispersion`: Process described in `details`. 
#' @details @@ -149,6 +148,8 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread #' 2. Calculate dispersion of each feature as `log(variance / mean)` #' 3. Z-score normalize dispersion within each bin, and select highest normalized dispersion across all bins #' +#' If the number of features within a bin is equal to 1, then the mean dispersion for that bin is set to 1. +#' #' This should be equivalent to `Seurat::FindVariableFeatures()` with `selection.method="mean.var.plot"` #' and `scanpy.pp.highly_variable_genes()` with `flavor="seurat"`. #' @export @@ -195,7 +196,8 @@ select_features_binned_dispersion <- function( ) %>% dplyr::ungroup() %>% dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) %>% - dplyr::select(c("names", "score", "highly_variable")) + dplyr::select(c("names", "dispersion", "bin", "score", "highly_variable")) %>% + dplyr::rename("raw_log_dispersion" = "dispersion") return(features_df) } @@ -257,7 +259,7 @@ project.DimReduction <- function(x, mat, ...) { #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to #' the corr_cutoff, it will be excluded from the final PCA matrix. -#' @param scale_factor (numeric) Scale factor to use for tf-idf normalization. +#' @param scale_factor (numeric) Scaling factor to multiply matrix by prior to log normalization (see formulas below). #' @param threads (integer) Number of threads to use.
#' @returns An object of class `c("LSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data diff --git a/r/R/transforms.R b/r/R/transforms.R index 8b8ab7f5..a326955e 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -934,7 +934,7 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' #' @rdname normalize #' @param mat (IterableMatrix) Counts matrix to normalize. `(features x cells)` -#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization. +#' @param scale_factor (numeric) Scaling factor to multiply matrix by prior to normalization (see formulas below). #' @param threads (integer) Number of threads to use. #' @returns For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, #' transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: @@ -962,9 +962,9 @@ normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { #' @rdname normalize -#' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then -#' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. -#' Else, map each feature name to its mean value. +#' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by (rowMeans(mat) by default). +#' If feature_means has names and mat has row names, match values by name. +#' Otherwise, assume feature_means has the same length and ordering as the matrix rows. #' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} #' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 2019, matching the default behavior of `Signac::RunTFIDF()`. This also matches the normalization used within `ArchR::addIterativeLSI()`, but with `binarize = FALSE`. 
#' @export diff --git a/r/man/lsi.Rd b/r/man/LSI.Rd similarity index 94% rename from r/man/lsi.Rd rename to r/man/LSI.Rd index 2b544b8d..8d719c00 100644 --- a/r/man/lsi.Rd +++ b/r/man/LSI.Rd @@ -21,7 +21,7 @@ LSI( \item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to the corr_cutoff, it will be excluded from the final PCA matrix.} -\item{scale_factor}{(numeric) Scale factor to use for tf-idf normalization.} +\item{scale_factor}{(numeric) Scaling factor to multiply matrix by prior to log normalization (see formulas below).} \item{threads}{(integer) Number of threads to use.} } diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index 83d0f27d..f1c63234 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -42,18 +42,15 @@ select_features_binned_dispersion( \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} -\item{num_feats}{(float) Number of features to return. If the number given is between 0 and 1, treat as a proportion of -the number of rows, rounded down. Otherwise, treat as an absolute number. -If the number is higher than the number of features in the matrix, -all features will be returned.} +\item{num_feats}{(float) Number of features to mark as highly_variable. If 0 < num_feats < 1, then interpret it as a fraction of features.} -\item{normalize}{(function) Normalize matrix using a given function. Normalization occurs on the input mat prior to feature -selection. If \code{NULL}, no normalization is performed. @seealso \code{normalize_tfidf()} \code{normalize_log()}} +\item{normalize}{(function) Normalize the matrix prior to feature selection by calling normalize(mat) if it's not NULL. +For example, pass normalize_log() or normalize_tfidf(). 
+If the normalize function accepts a threads argument, that will be passed as normalize(mat, threads=threads).} \item{threads}{(integer) Number of threads to use.} -\item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, -and if the number of features within a bin is less than 2, the dispersion is set to 1.} +\item{n_bins}{(integer) Number of bins to split features into in order to control for the relationship between mean expression and dispersion (see details).} } \value{ Return a dataframe with the following columns: @@ -83,7 +80,9 @@ each feature \eqn{x_i} as follows: } } \description{ -Apply a feature selection method to a \verb{(features x cells)} matrix. +Apply a feature selection method to a non-normalized \verb{(features x cells)} matrix. We recommend using counts matrices as input and +apply any normalizations prior to feature selection via the normalize argument (if available). The output of these functions is a dataframe that has columns that +at the minimum include the feature names and a score for each feature. } \details{ \code{select_features_binned_dispersion} implements the approach from Satija et al. 2015: @@ -93,6 +92,8 @@ Apply a feature selection method to a \verb{(features x cells)} matrix. \item Z-score normalize dispersion within each bin, and select highest normalized dispersion across all bins } +If the number of features within a bin is equal to 1, then the mean dispersion for that bin is set to 1. + This should be equivalent to \code{Seurat::FindVariableFeatures()} with \code{selection.method="mean.var.plot"} and \code{scanpy.pp.highly_variable_genes()} with \code{flavor="seurat"}.
} @@ -115,5 +116,7 @@ select_features_variance( num_feats=2, normalize=normalize_log(scale_factor=20) ) - +} +\seealso{ +\code{normalize_tfidf()} \code{normalize_log()} } From 74e12b718e8276ef6b777b1100115938eeecf9dc Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 7 Feb 2025 13:32:33 -0800 Subject: [PATCH 067/142] [r] change matrix checks in feat selection, normalization, update normalization docs --- r/R/singlecell_utils.R | 17 +++++++---------- r/R/transforms.R | 6 ++---- r/man/normalize.Rd | 8 ++++---- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index e6b0c6d7..2c1be1fd 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -64,8 +64,7 @@ select_features_variance <- function( assert_len(num_feats, 1) assert_is(num_feats, "numeric") if (rlang::is_missing(mat)) return(create_partial()) - if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") - assert_is(mat, "IterableMatrix") + assert_is_mat(mat) if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (min(max(num_feats, 0), nrow(mat)) != num_feats) { rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) @@ -121,8 +120,7 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread assert_greater_than_zero(num_feats) assert_is(num_feats, "numeric") if (rlang::is_missing(mat)) return(create_partial()) - if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") - assert_is(mat, "IterableMatrix") + assert_is_mat(mat) if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (min(max(num_feats, 0), nrow(mat)) != num_feats) { rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) @@ 
-163,8 +161,7 @@ select_features_binned_dispersion <- function( assert_len(n_bins, 1) assert_greater_than_zero(n_bins) if (rlang::is_missing(mat)) return(create_partial()) - if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") - assert_is(mat, "IterableMatrix") + assert_is_mat(mat) if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (min(max(num_feats, 0), nrow(mat)) != num_feats) { rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) @@ -282,7 +279,7 @@ LSI <- function( if (rlang::is_missing(mat)) { return(create_partial()) } - assert_is(mat, "IterableMatrix") + assert_is_mat(mat) assert_is_wholenumber(n_dimensions) assert_len(n_dimensions, 1) assert_greater_than_zero(n_dimensions) @@ -334,7 +331,7 @@ LSI <- function( #' @export project.LSI <- function(x, mat, threads = 1L, ...) { - assert_is(mat, "IterableMatrix") + assert_is_mat(mat) assert_is(x, "LSI") fitted_params <- x$fitted_params @@ -408,7 +405,7 @@ IterativeLSI <- function( cluster_method = cluster_graph_leiden, threads = 1L, verbose = FALSE ) { - assert_is(mat, "IterableMatrix") + assert_is_mat(mat) assert_true(n_iterations > 0) assert_is_wholenumber(n_iterations) assert_is_wholenumber(threads) @@ -482,7 +479,7 @@ IterativeLSI <- function( } #' @export project.IterativeLSI <- function(x, mat, threads = 1L, ...) 
{ - assert_is(mat, "IterableMatrix") + assert_is_mat(mat) fitted_params <- x$fitted_params # Get the final row of fitted params last_iter_info <- fitted_params$iter_info[nrow(fitted_params$iter_info),] diff --git a/r/R/transforms.R b/r/R/transforms.R index a326955e..62f1f9d5 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -953,8 +953,7 @@ normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_greater_than_zero(scale_factor) assert_is_wholenumber(threads) if (rlang::is_missing(mat)) return(create_partial()) - if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") - assert_is(mat, "IterableMatrix") + assert_is_mat(mat) read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) mat <- mat %>% multiply_cols(1 / read_depth) return(log1p(mat * scale_factor)) @@ -975,8 +974,7 @@ normalize_tfidf <- function( assert_greater_than_zero(scale_factor) assert_is_wholenumber(threads) if (rlang::is_missing(mat)) return(create_partial()) - if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") - assert_is(mat, "IterableMatrix") + assert_is_mat(mat) # If feature means are passed in, only need to calculate term frequency if (is.null(feature_means)) { mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd index f6467dca..4e55ab95 100644 --- a/r/man/normalize.Rd +++ b/r/man/normalize.Rd @@ -12,13 +12,13 @@ normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) \arguments{ \item{mat}{(IterableMatrix) Counts matrix to normalize. 
\verb{(features x cells)}} -\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} +\item{scale_factor}{(numeric) Scaling factor to multiply matrix by prior to normalization (see formulas below).} \item{threads}{(integer) Number of threads to use.} -\item{feature_means}{(numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then -each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. -Else, map each feature name to its mean value.} +\item{feature_means}{(numeric, optional) Pre-calculated means of the features to normalize by (rowMeans(mat) by default). +If feature_means has names and mat has row names, match values by name. +Otherwise, assume feature_means has the same length and ordering as the matrix rows.} } \value{ For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, From c8b73a2039d2a41a8529393254ccca2017036df5 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 7 Feb 2025 14:12:04 -0800 Subject: [PATCH 068/142] [r] add docs changes to DimReduction, allow for knn parameterization in IterativeLSI --- r/R/clustering.R | 2 ++ r/R/singlecell_utils.R | 35 ++++++++++++++++++++++------------- r/man/DimReduction.Rd | 6 ++++-- r/man/IterativeLSI.Rd | 15 +++++++++++---- r/man/LSI.Rd | 5 +++-- r/man/project.Rd | 2 +- r/pkgdown/_pkgdown.yml | 5 ++--- 7 files changed, 45 insertions(+), 25 deletions(-) diff --git a/r/R/clustering.R b/r/R/clustering.R index b53a4edf..bffa7eb8 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -238,6 +238,7 @@ cluster_membership_matrix <- function(groups, group_order = NULL) { #' @export knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine"), verbose = TRUE, threads = 1, ef = 100) { metric <- match.arg(metric) + if (rlang::is_missing(data)) return(create_partial()) index <- RcppHNSW::hnsw_build( data, distance = metric, @@ -273,6 +274,7 @@ 
knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine #' @export knn_annoy <- function(data, query = data, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, search_k = -1) { metric <- match.arg(metric) + if (rlang::is_missing(data)) return(create_partial()) annoy <- switch(metric, "euclidean" = new(RcppAnnoy::AnnoyEuclidean, ncol(data)), "cosine" = new(RcppAnnoy::AnnoyAngular, ncol(data)), diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 2c1be1fd..93f3acf1 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -208,8 +208,9 @@ select_features_binned_dispersion <- function( #' Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. #' Child classes should implement a `project` method to allow for the projection of other matrices using #' the fitted transformation object. -#' @field cell_embeddings (IterableMatrix, dgCMatrix, matrix) The projected data -#' @field fitted_params (list) A list of parameters used for the transformation of a matrix. +#' @field cell_embeddings (IterableMatrix, dgCMatrix, matrix) Projected data of shape `(n_dimesions x n_cells)` of the original matrix after a dimensionality reduction. +#' @field fitted_params (list) A list of parameters used for the transformation of a matrix. This should include all necessary information to project new data with the same features. +#' @field feature_names (character) The names of the features that this DimReduction object was fit on. Matrices to be projected should have the same feature names. #' @export DimReduction <- function(x, fitted_params = list(), ...) { assert_is(x, c("IterableMatrix", "dgCMatrix", "matrix")) @@ -228,7 +229,7 @@ DimReduction <- function(x, fitted_params = list(), ...) { #' @param mat IterableMatrix object. #' @return IterableMatrix object of the projected data. 
#' @details DimReduction subclasses should use this to project new data with the same features, to project into the same latent space. -#' All required information to run a projection should be held in x$fitted_params, including pertinent parameters when construction the DimReduction subclass object. +#' All required information to run a projection should be held in x$fitted_params, including pertinent parameters when constructing the DimReduction subclass object. #' If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features. #' If there are rownames, reorder the matrix to match the order of the original matrix #' @export @@ -251,8 +252,9 @@ project.DimReduction <- function(x, mat, ...) { #' Perform latent semantic indexing (LSI) on a matrix. #' -#' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. -#' @param mat (IterableMatrix) dimensions features x cells. +#' Given a `(features x cells)` counts matrix, perform LSI which sequentially executes tf-idf normalization and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. +#' Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. +#' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to #' the corr_cutoff, it will be excluded from the final PCA matrix. @@ -361,12 +363,16 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' Run iterative LSI on a matrix. 
#' -#' Given a `(features x cells)` matrix, Compute an iterative LSI dimensionality reduction, using the method described in [ArchR](https://doi.org/10.1038/s41588-021-00790-6) (Granja et al; 2019). -#' @param mat (IterableMatrix) +#' Given a `(features x cells)` counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. This uses the method described in [ArchR](https://doi.org/10.1038/s41588-021-00790-6) (Granja et al; 2019). +#' See details for more specific information. Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. +#' +#' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. #' @param n_iterations (int) The number of LSI iterations to perform. #' @param first_feature_selection_method (function) Method to use for selecting features for the first iteration. Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` #' @param feature_selection_method (function) Method to use for selecting features for each iteration after the first. Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` -#' @param cluster_method (function) Method to use for clustering. Current builtin options are `cluster_graph_{leiden, louvain, seurat}()` +#' @param knn_method (function) Method to use for obtaining a kNN matrix for determining cluster assignments of cells. Current builtin options are `knn_hnsw()` and `knn_annoy()`. The +#' user can pass in partial parameters to the knn method, such as by passing `knn_hnsw(ef = 500, k = 12)` +#' @param cluster_method (function) Method to use for clustering a kNN matrix. Current builtin options are `cluster_graph_{leiden, louvain, seurat}()` #' @param lsi_method (function) Method to use for LSI.
Only `LSI` is allowed. The user can pass in partial parameters to `LSI` to customize the LSI method, such as by passing `LSI(n_dimensions = 30, corr_cutoff = 0.5)`. #' @return An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data @@ -393,7 +399,9 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' - Perform LSI on the selected features #' - If this is the final iteration, return the PCA results #' - Else, cluster the LSI results using `cluster_method` -#' @seealso `LSI()`, `feature_selection`, `DimReduction()` +#' @seealso `LSI()` `DimReduction()` `knn_hnsw()` `knn_annoy()` +#' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` +#' `select_features_mean()` `select_features_binned_dispersion()` #' @inheritParams LSI #' @export IterativeLSI <- function( @@ -401,7 +409,8 @@ IterativeLSI <- function( n_iterations = 2, first_feature_selection_method = select_features_binned_dispersion, feature_selection_method = select_features_dispersion, - lsi_method = LSI, # Make only allowed to be LSI + lsi_method = LSI, + knn_method = knn_hnsw(ef = 500), cluster_method = cluster_graph_leiden, threads = 1L, verbose = FALSE ) { @@ -431,9 +440,9 @@ IterativeLSI <- function( # run variable feature selection if (verbose) log_progress("Selecting features") if (i == 1) { - variable_features <- first_feature_selection_method(mat, threads = threads) + variable_features <- partial_apply(first_feature_selection_method, threads = threads)(mat) } else { - variable_features <- feature_selection_method(pseudobulk_res, threads = threads) + variable_features <- partial_apply(feature_selection_method, threads = threads)(pseudobulk_res) } fitted_params$iter_info$feature_names[[i]] <- variable_features %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) @@ -457,7 +466,7 @@ IterativeLSI <- function( if (i == n_iterations) break # 
cluster the LSI results if (verbose) log_progress("Clustering LSI results") - clustering_res <- t(lsi_res_obj$cell_embeddings) %>% knn_hnsw(ef = 500, threads = threads) %>% knn_to_snn_graph() %>% cluster_method() + clustering_res <- t(lsi_res_obj$cell_embeddings) %>% partial_apply(knn_method, threads = threads)() %>% knn_to_snn_graph() %>% cluster_method() fitted_params$iter_info$clusters[[i]] <- clustering_res # pseudobulk and pass onto next iteration if (verbose) log_progress("Pseudobulking matrix") diff --git a/r/man/DimReduction.Rd b/r/man/DimReduction.Rd index 2bb7486c..acb74eac 100644 --- a/r/man/DimReduction.Rd +++ b/r/man/DimReduction.Rd @@ -14,8 +14,10 @@ the fitted transformation object. \section{Fields}{ \describe{ -\item{\code{cell_embeddings}}{(IterableMatrix, dgCMatrix, matrix) The projected data} +\item{\code{cell_embeddings}}{(IterableMatrix, dgCMatrix, matrix) Projected data of shape \verb{(n_dimesions x n_cells)} of the original matrix after a dimensionality reduction.} -\item{\code{fitted_params}}{(list) A list of parameters used for the transformation of a matrix.} +\item{\code{fitted_params}}{(list) A list of parameters used for the transformation of a matrix. This should include all necessary information to project new data with the same features.} + +\item{\code{feature_names}}{(character) The names of the features that this DimReduction object was fit on. 
Matrices to be projected should have the same feature names.} }} diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 99511206..ff8eb5d6 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -10,13 +10,14 @@ IterativeLSI( first_feature_selection_method = select_features_binned_dispersion, feature_selection_method = select_features_dispersion, lsi_method = LSI, + knn_method = knn_hnsw(ef = 500), cluster_method = cluster_graph_leiden, threads = 1L, verbose = FALSE ) } \arguments{ -\item{mat}{(IterableMatrix)} +\item{mat}{(IterableMatrix) Counts matrix of shape \verb{(features x cells)}.} \item{n_iterations}{(int) The number of LSI iterations to perform.} @@ -26,7 +27,10 @@ IterativeLSI( \item{lsi_method}{(function) Method to use for LSI. Only \code{LSI} is allowed. The user can pass in partial parameters to \code{LSI} to customize the LSI method, such as by passing \code{LSI(n_dimensions = 30, corr_cutoff = 0.5)}.} -\item{cluster_method}{(function) Method to use for clustering. Current builtin options are \verb{cluster_graph_\{leiden, louvain, seurat\}()}} +\item{knn_method}{(function) Method to use for obtaining a kNN matrix for determining clusters assignments of cells. Current builtin options are \code{knn_hnsw()} and \code{knn_annoy()}. The +user can pass in partial parameters to the knn method, such as by passing \code{knn_hnsw(ef = 500, k = 12)}} + +\item{cluster_method}{(function) Method to use for clustering a kNN matrix. Current builtin options are \verb{cluster_graph_\{leiden, louvain, seurat\}()}} \item{threads}{(integer) Number of threads to use.} } @@ -50,7 +54,8 @@ An object of class \code{c("IterativeLSI", "DimReduction")} with the following a } } \description{ -Given a \verb{(features x cells)} matrix, Compute an iterative LSI dimensionality reduction, using the method described in \href{https://doi.org/10.1038/s41588-021-00790-6}{ArchR} (Granja et al; 2019). 
+Given a \verb{(features x cells)} counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. This uses the method described in \href{https://doi.org/10.1038/s41588-021-00790-6}{ArchR} (Granja et al; 2019). +See details for more specific information. Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. } \details{ The iterative LSI method is as follows: @@ -72,5 +77,7 @@ The iterative LSI method is as follows: } } \seealso{ -\code{LSI()}, \code{feature_selection}, \code{DimReduction()} +\code{LSI()} \code{DimReduction()} \code{knn_hnsw()} \code{knn_annoy()} +\code{cluster_graph_leiden()} \code{cluster_graph_louvain()} \code{cluster_graph_seurat()} \code{select_features_variance()} \code{select_features_dispersion()} +\code{select_features_mean()} \code{select_features_binned_dispersion()} } diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index 8d719c00..af9bdd52 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -14,7 +14,7 @@ LSI( ) } \arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells.} +\item{mat}{(IterableMatrix) Counts matrix of shape \verb{(features x cells)}.} \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} @@ -40,7 +40,8 @@ An object of class \code{c("LSI", "DimReduction")} with the following attributes } } \description{ -Given a \verb{(features x cells)} matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. +Given a \verb{(features x cells)} counts matrix, perform LSI which sequentially executes tf-idf normalization and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. +Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. 
} \details{ Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. diff --git a/r/man/project.Rd b/r/man/project.Rd index dcab31b5..696294d5 100644 --- a/r/man/project.Rd +++ b/r/man/project.Rd @@ -19,7 +19,7 @@ Perform a dimensionality reduction on a matrix using a pre-fit DimReduction obje } \details{ DimReduction subclasses should use this to project new data with the same features, to project into the same latent space. -All required information to run a projection should be held in x$fitted_params, including pertinent parameters when construction the DimReduction subclass object. +All required information to run a projection should be held in x$fitted_params, including pertinent parameters when constructing the DimReduction subclass object. If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features. If there are rownames, reorder the matrix to match the order of the original matrix } diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index d3d9a0c1..368760c9 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -136,7 +136,7 @@ reference: - pseudobulk_matrix - title: "Single-cell analysis helpers" -- subtitle: "Dimensionality reduction" +- subtitle: "Normalizations and Feature Selection" - contents: - normalize_log - select_features_variance @@ -146,8 +146,7 @@ reference: - cluster_graph_leiden - knn_to_graph - cluster_membership_matrix - -- title: "Dimensionality Reductions" +- subtitle: "Dimensionality Reductions" - contents: - DimReduction - project From a3dd8a3ca062beb42031975e34be32bc6dc2b220 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 7 Feb 2025 18:21:24 -0800 Subject: [PATCH 069/142] [r] change feature selection column name from names to feature --- r/R/singlecell_utils.R | 16 ++++++++-------- r/man/feature_selection.Rd | 2 +- 
r/tests/testthat/test-singlecell_utils.R | 10 +++++----- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 93f3acf1..56e1835b 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -1,4 +1,4 @@ -# Copyright 2023 BPCells contributors +# Copyright 2025 BPCells contributors # # Licensed under the Apache License, Version 2.0 or the MIT license @@ -25,7 +25,7 @@ #' @param threads (integer) Number of threads to use. #' @returns #' Return a dataframe with the following columns: -#' - `names`: Feature name. +#' - `feature`: Feature name. #' - `score`: Scoring of the feature, depending on the method used. #' - `highly_variable`: Logical vector of whether the feature is highly variable. #' @@ -72,7 +72,7 @@ select_features_variance <- function( num_feats <- min(max(num_feats, 0), nrow(mat)) if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads, verbose = verbose)(mat) features_df <- tibble::tibble( - names = rownames(mat), + feature = rownames(mat), score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",] ) %>% dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) @@ -104,7 +104,7 @@ select_features_dispersion <- function( if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads, verbose = verbose)(mat) mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) features_df <- tibble::tibble( - names = rownames(mat), + feature = rownames(mat), score = mat_stats$row_stats["variance", ] / mat_stats$row_stats["mean", ] ) %>% dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) @@ -129,7 +129,7 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads, verbose = verbose)(mat) # get the sum of each feature, binarized features_df <- tibble::tibble( - names 
= rownames(mat), + feature = rownames(mat), score = matrix_stats(mat, row_stats = "nonzero", threads = threads)$row_stats["nonzero", ] ) %>% dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) @@ -178,7 +178,7 @@ select_features_binned_dispersion <- function( feature_dispersion[feature_means == 0] <- 0 feature_means <- log1p(feature_means) features_df <- tibble::tibble( - names = names(feature_means), + feature = names(feature_means), var = feature_vars, mean = feature_means, dispersion = feature_dispersion @@ -193,7 +193,7 @@ select_features_binned_dispersion <- function( ) %>% dplyr::ungroup() %>% dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) %>% - dplyr::select(c("names", "dispersion", "bin", "score", "highly_variable")) %>% + dplyr::select(c("feature", "dispersion", "bin", "score", "highly_variable")) %>% dplyr::rename("raw_log_dispersion" = "dispersion") return(features_df) } @@ -444,7 +444,7 @@ IterativeLSI <- function( } else { variable_features <- partial_apply(feature_selection_method, threads = threads)(pseudobulk_res) } - fitted_params$iter_info$feature_names[[i]] <- variable_features %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + fitted_params$iter_info$feature_names[[i]] <- variable_features %>% dplyr::filter(highly_variable) %>% dplyr::pull(feature) if (is.character(fitted_params$iter_info$feature_names[[i]])) { mat_indices <- which(rownames(mat) %in% fitted_params$iter_info$feature_names[[i]]) diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index f1c63234..71a2a6f9 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -55,7 +55,7 @@ If the normalize function accepts a threads argument, that will passed as normal \value{ Return a dataframe with the following columns: \itemize{ -\item \code{names}: Feature name. +\item \code{feature}: Feature name. \item \code{score}: Scoring of the feature, depending on the method used. 
\item \code{highly_variable}: Logical vector of whether the feature is highly variable. } diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 27e57055..bed49005 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -1,4 +1,4 @@ -# Copyright 2023 BPCells contributors +# Copyright 2025 BPCells contributors # # Licensed under the Apache License, Version 2.0 or the MIT license @@ -27,7 +27,7 @@ test_that("select_features works general case", { res <- do.call(fn, list(m1, num_feats = 5)) expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting expect_equal(sum(res$highly_variable), 5) # Only 10 features marked as highly variable - expect_setequal(res$names, rownames(m1)) + expect_setequal(res$feature, rownames(m1)) res_more_feats_than_rows <- suppressWarnings(do.call(fn, list(m1, num_feats = 10000))) # more features than rows res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) res_feats_partial <- get(fn)(num_feats = 100)(m1) @@ -40,7 +40,7 @@ test_that("select_features works general case", { res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize = normalize_log(scale = 1e3, threads = 1L))) res_norm_implicit_partial <- select_features_variance(normalize = normalize_log(scale_factor = 1e3), num_feats = 10)(m1) expect_identical(res_norm_partial, res_norm_implicit_partial) - expect_true(!all((res_no_norm %>% dplyr::arrange(names))$score == (res_norm_partial %>% dplyr::arrange(names))$score)) + expect_true(!all((res_no_norm %>% dplyr::arrange(feature))$score == (res_norm_partial %>% dplyr::arrange(feature))$score)) } } }) @@ -203,9 +203,9 @@ test_that("Feature selection by bin variance works", { # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to Seurat res_table <- select_features_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) res_table_t <- select_features_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) - res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(feature) res <- mat[res_feats,] - res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(feature) res_t <- t(mat[,res_feats_t]) expect_equal(nrow(res), 10) From edd4dfeceec4315bd9de15bc7a4f7931c9471d83 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 7 Feb 2025 19:12:11 -0800 Subject: [PATCH 070/142] [r] change iterative LSI defaults --- r/NAMESPACE | 1 - r/R/singlecell_utils.R | 33 ++++++++++++++++----------------- r/man/IterativeLSI.Rd | 15 ++++++++------- 3 files changed, 24 insertions(+), 25 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 4f14443e..9cfed020 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -2,7 +2,6 @@ S3method(base::as.data.frame,IterableFragments) S3method(base::as.matrix,IterableMatrix) -S3method(project,DimReduction) S3method(project,IterativeLSI) S3method(project,LSI) S3method(project,default) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 56e1835b..2ba3810c 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -238,11 +238,7 @@ project <- function(x, mat, ...) { } #' @export project.default <- function(x, mat, ...) { - rlang::abort("project method not implemented for BPCells objects.") -} -#' @export -project.DimReduction <- function(x, mat, ...) 
{ - rlang::abort("project method not implemented for base DimReduction object.") + rlang::abort("project method not implemented for objects that are not a fitted DimReduction") } ################# @@ -346,6 +342,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { mat <- partial_apply( normalize_tfidf, feature_means = fitted_params$feature_means, + scale_factor = fitted_params$scale_factor, threads = threads )(mat) mat <- write_matrix_dir( @@ -377,12 +374,13 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @return An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data #' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: -#' - `first_feature_selection_method`: The method used for selecting features for the first iteration -#' - `lsi_method`: The method used for LSI -#' - `cluster_method`: The method used for clustering -#' - `feature_means`: The means of the features used for normalization -#' - `iterations`: The number of iterations -#' - `iter_info`: A tibble with the following columns: +#' - `first_feature_selection_method`: The method used for selecting features for the first iteration +#' - `lsi_method`: The method used for LSI +#' - `knn_method`: The method used for obtaining a kNN matrix +#' - `cluster_method`: The method used for clustering +#' - `feature_means`: The means of the features used for tf-idf normalization +#' - `iterations`: The number of LSI iterations ran +#' - `iter_info`: A tibble with the following columns: #' - `iteration`: The iteration number #' - `feature_names`: The names of the features used for the iteration #' - `lsi_results`: The results of LSI for the iteration. This follows the same structure as the `fitted_params` attribute of the `LSI` object, but information such as the `v` and `d` matrices are removed. 
@@ -392,13 +390,13 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' - First iteration: #' - Select features based on the `first_feature_selection_method` argument #' - Perform LSI on the selected features -#' - If `n_iterations` is 1, return the PCA results -#' - Else, cluster the LSI results using `cluster_method` +#' - If `n_iterations` is 1, return the projected data from the first PCA projection +#' - Else, turn the LSI results into a kNN matrix using `knn_method`, then cluster the kNN matrix using `cluster_method` #' - For each subsequent iteration: #' - Pseudobulk the clusters and select the top features based on the variance of the pseudobulked clusters #' - Perform LSI on the selected features -#' - If this is the final iteration, return the PCA results -#' - Else, cluster the LSI results using `cluster_method` +#' - If this is the final iteration, return the projected data from this PCA projection +#' - Else, turn the LSI results into a kNN matrix using `knn_method`, then cluster the kNN matrix using `cluster_method` #' @seealso `LSI()` `DimReduction()` `knn_hnsw()` `knn_annoy()` #' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` #' `select_features_mean()` `select_features_binned_dispersion()` @@ -410,7 +408,7 @@ IterativeLSI <- function( first_feature_selection_method = select_features_binned_dispersion, feature_selection_method = select_features_dispersion, lsi_method = LSI, - knn_method = knn_hnsw(ef = 500), + knn_method = knn_hnsw, cluster_method = cluster_graph_leiden, threads = 1L, verbose = FALSE ) { @@ -422,6 +420,7 @@ IterativeLSI <- function( fitted_params = list( first_feature_selection_method = first_feature_selection_method, lsi_method = lsi_method, + knn_method = knn_method, cluster_method = cluster_method, feature_means = matrix_stats(mat, row_stats = "mean", threads = threads)$row_stats["mean",], iterations = n_iterations, @@ -477,7 +476,7 @@ 
IterativeLSI <- function( ) rownames(pseudobulk_res) <- rownames(mat) } - if (verbose) log_progress("Finished running LSI") + if (verbose) log_progress("Finished running Iterative LSI") res <- DimReduction( x = lsi_res_obj$cell_embeddings, fitted_params = fitted_params, diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index ff8eb5d6..ac26779b 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -10,7 +10,7 @@ IterativeLSI( first_feature_selection_method = select_features_binned_dispersion, feature_selection_method = select_features_dispersion, lsi_method = LSI, - knn_method = knn_hnsw(ef = 500), + knn_method = knn_hnsw, cluster_method = cluster_graph_leiden, threads = 1L, verbose = FALSE @@ -41,9 +41,10 @@ An object of class \code{c("IterativeLSI", "DimReduction")} with the following a \item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: \item \code{first_feature_selection_method}: The method used for selecting features for the first iteration \item \code{lsi_method}: The method used for LSI +\item \code{knn_method}: The method used for obtaining a kNN matrix \item \code{cluster_method}: The method used for clustering -\item \code{feature_means}: The means of the features used for normalization -\item \code{iterations}: The number of iterations +\item \code{feature_means}: The means of the features used for tf-idf normalization +\item \code{iterations}: The number of LSI iterations ran \item \code{iter_info}: A tibble with the following columns: \itemize{ \item \code{iteration}: The iteration number @@ -64,15 +65,15 @@ The iterative LSI method is as follows: \itemize{ \item Select features based on the \code{first_feature_selection_method} argument \item Perform LSI on the selected features -\item If \code{n_iterations} is 1, return the PCA results -\item Else, cluster the LSI results using \code{cluster_method} +\item If \code{n_iterations} is 1, return the 
projected data from the first PCA projection +\item Else, turn the LSI results into a kNN matrix using \code{knn_method}, then cluster the kNN matrix using \code{cluster_method} } \item For each subsequent iteration: \itemize{ \item Pseudobulk the clusters and select the top features based on the variance of the pseudobulked clusters \item Perform LSI on the selected features -\item If this is the final iteration, return the PCA results -\item Else, cluster the LSI results using \code{cluster_method} +\item If this is the final iteration, return the projected data from this PCA projection +\item Else, turn the LSI results into a kNN matrix using \code{knn_method}, then cluster the kNN matrix using \code{cluster_method} } } } From ff8fb5841eed730146c8373c021acbd5b68996ca Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Sun, 9 Feb 2025 20:35:08 -0800 Subject: [PATCH 071/142] [r] update `select_features_mean()` to be called `select_features_accessibility()` --- r/NAMESPACE | 2 +- r/NEWS.md | 2 +- r/R/singlecell_utils.R | 14 +++++++------- r/R/transforms.R | 2 +- r/man/feature_selection.Rd | 6 +++--- r/man/normalize.Rd | 2 +- r/tests/testthat/test-singlecell_utils.R | 2 +- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 9cfed020..60362432 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -117,9 +117,9 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) +export(select_features_accessibility) export(select_features_binned_dispersion) export(select_features_dispersion) -export(select_features_mean) export(select_features_variance) export(select_regions) export(set_trackplot_height) diff --git a/r/NEWS.md b/r/NEWS.md index 52b2e1fc..62ecb282 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -12,7 +12,7 @@ Contributions welcome :) - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. 
(Thanks to @ycli1995 for pull request #166) - Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168) - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #189) -- Add feature selection functions `select_features_variance()`, `select_features_{variance,dispersion,mean,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189) +- Add feature selection functions `select_features_variance()`, and `select_features_{dispersion,accessibility,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189) - Add `LSI()` and `IterativeLSI()` dimensionality functions to perform latent semantic indexing on a matrix (pull request #189). ## Improvements diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 2ba3810c..cc479d46 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -111,12 +111,12 @@ select_features_dispersion <- function( return(features_df) } - +#where \eqn{x_ij} = 1 & \text{if } x_{ij} == 0 \\ 0 & \text{otherwise} \end{cases} #' @rdname feature_selection #' @returns -#' - `select_features_mean`: \eqn{\mathrm{Score}(x_i) = \bar{x}_i} +#' - `select_features_accessibility`: \eqn{\mathrm{Score}(x_i) = \sum_{j=1}^{n} \bigl({x}_{ij}^{\mathrm{binarized}})\bigr)}, where \eqn{x_{ij}^{\mathrm{binarized}}} is defined as \eqn{1} if \eqn{x_{ij} != 0} and \eqn{0} otherwise. #' @export -select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L, verbose = FALSE) { +select_features_accessibility <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L, verbose = FALSE) { assert_greater_than_zero(num_feats) assert_is(num_feats, "numeric") if (rlang::is_missing(mat)) return(create_partial()) @@ -248,7 +248,7 @@ project.default <- function(x, mat, ...) 
{ #' Perform latent semantic indexing (LSI) on a matrix. #' -#' Given a `(features x cells)` counts matrix, perform LSI which sequentially executes tf-idf normalization and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. +#' Given a `(features x cells)` counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. #' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. @@ -365,8 +365,8 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' #' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. #' @param n_iterations (int) The number of LSI iterations to perform. -#' @param first_feature_selection_method (function) Method to use for selecting features for the first iteration. Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` -#' @param feature_selection_method (function) Method to use for selecting features for each iteration after the first. Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` +#' @param first_feature_selection_method (function) Method to use for selecting features for the first iteration. Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_accessibility`, `select_features_binned_dispersion` +#' @param feature_selection_method (function) Method to use for selecting features for each iteration after the first. 
Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_accessibility`, `select_features_binned_dispersion` #' @param knn_method (function) Method to use for obtaining a kNN matrix for determining clusters assignments of cells. Current builtin options are `knn_hnsw()` and `knn_annoy()`. The #' user can pass in partial parameters to the knn method, such as by passing `knn_hnsw(ef = 500, k = 12)` #' @param cluster_method (function) Method to use for clustering a kNN matrix. Current builtin options are `cluster_graph_{leiden, louvain, seurat}()` @@ -399,7 +399,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' - Else, turn the LSI results into a kNN matrix using `knn_method`, then cluster the kNN matrix using `cluster_method` #' @seealso `LSI()` `DimReduction()` `knn_hnsw()` `knn_annoy()` #' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` -#' `select_features_mean()` `select_features_binned_dispersion()` +#' `select_features_accessibility()` `select_features_binned_dispersion()` #' @inheritParams LSI #' @export IterativeLSI <- function( diff --git a/r/R/transforms.R b/r/R/transforms.R index 62f1f9d5..5ee851e0 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -944,7 +944,7 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' **Passing to `normalize` parameters with non-default arguments** #' #' If the `mat` argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. -#' This can be used to customize `normalize` parameters in other single cell functions in BPCells (e.g. `select_features_mean()`). +#' This can be used to customize `normalize` parameters in other single cell functions in BPCells (e.g. `select_features_accessibility()`). 
#' #' **Related functions from other packages** #' - `normalize_log`: Corresponds to `Seurat::NormalizeData()` with its default "LogNormalize" method. diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index 71a2a6f9..d4a2bf8e 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -3,7 +3,7 @@ \name{select_features_variance} \alias{select_features_variance} \alias{select_features_dispersion} -\alias{select_features_mean} +\alias{select_features_accessibility} \alias{select_features_binned_dispersion} \title{Feature selection functions} \usage{ @@ -23,7 +23,7 @@ select_features_dispersion( verbose = FALSE ) -select_features_mean( +select_features_accessibility( mat, num_feats = 0.05, normalize = NULL, @@ -72,7 +72,7 @@ each feature \eqn{x_i} as follows: } \itemize{ -\item \code{select_features_mean}: \eqn{\mathrm{Score}(x_i) = \bar{x}_i} +\item \code{select_features_accessibility}: \eqn{\mathrm{Score}(x_i) = \sum_{j=1}^{n} \bigl({x}_{ij}^{\mathrm{binarized}})\bigr)}, where \eqn{x_{ij}^{\mathrm{binarized}}} is defined as \eqn{1} if \eqn{x_{ij} != 0} and \eqn{0} otherwise. } \itemize{ diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd index 4e55ab95..65dd0759 100644 --- a/r/man/normalize.Rd +++ b/r/man/normalize.Rd @@ -38,7 +38,7 @@ Apply standard normalizations to a \verb{(features x cells)} counts matrix. \strong{Passing to \code{normalize} parameters with non-default arguments} If the \code{mat} argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. -This can be used to customize \code{normalize} parameters in other single cell functions in BPCells (e.g. \code{select_features_mean()}). +This can be used to customize \code{normalize} parameters in other single cell functions in BPCells (e.g. \code{select_features_accessibility()}). 
\strong{Related functions from other packages} \itemize{ diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index bed49005..19da7f19 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -23,7 +23,7 @@ generate_dense_matrix <- function(nrow, ncol) { test_that("select_features works general case", { m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") - for (fn in c("select_features_variance", "select_features_dispersion", "select_features_mean")) { + for (fn in c("select_features_variance", "select_features_dispersion", "select_features_accessibility")) { res <- do.call(fn, list(m1, num_feats = 5)) expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting expect_equal(sum(res$highly_variable), 5) # Only 10 features marked as highly variable From 9c710650998dc8a1e33b4d7426316705fe5c238d Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Sun, 9 Feb 2025 22:28:10 -0800 Subject: [PATCH 072/142] [r] update mat assertion, LSI docs --- r/R/errorChecking.R | 3 ++- r/R/singlecell_utils.R | 7 +++++++ r/man/IterativeLSI.Rd | 12 +++++++++--- r/man/LSI.Rd | 2 +- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/r/R/errorChecking.R b/r/R/errorChecking.R index d65be9fb..a7866303 100644 --- a/r/R/errorChecking.R +++ b/r/R/errorChecking.R @@ -114,7 +114,8 @@ assert_is <- function(object, class, n = 1) { } } assert_is_mat <- function(object, n = 1) { - if (length(object) == 1) { + # matrices have length set to row*col instead of being 1, so we need to check dim as well + if (length(object) == 1 || !is.null(dim(object))) { if (!is(object, "IterableMatrix") && !canCoerce(mat, "IterableMatrix")) pretty_error(object, "must either be an IterableMatrix or coercible to an IterableMatrix", n) } else { diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index cc479d46..b201b19b 100644 --- a/r/R/singlecell_utils.R +++ 
b/r/R/singlecell_utils.R @@ -65,6 +65,7 @@ select_features_variance <- function( assert_is(num_feats, "numeric") if (rlang::is_missing(mat)) return(create_partial()) assert_is_mat(mat) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (min(max(num_feats, 0), nrow(mat)) != num_feats) { rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) @@ -397,6 +398,12 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' - Perform LSI on the selected features #' - If this is the final iteration, return the projected data from this PCA projection #' - Else, turn the LSI results into a kNN matrix using `knn_method`, then cluster the kNN matrix using `cluster_method` +#' +#' There are some minor differences when compared to the ArchR implementation. Firstly, the ArchR implementation uses a different method for selecting features in the first iteration. ArchR utilizes top accessibility as the default, while this implementation uses binned dispersion as the default. +#' `select_features_accessibility()` can be passed in for the `first_feature_selection_method` argument to mimic the ArchR implementation. +#' +#' Secondly, the ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, +#' which BPCells does not encounter even with a non-subsetted matrix. 
#' @seealso `LSI()` `DimReduction()` `knn_hnsw()` `knn_annoy()` #' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` #' `select_features_accessibility()` `select_features_binned_dispersion()` diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index ac26779b..e45dd98b 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -21,9 +21,9 @@ IterativeLSI( \item{n_iterations}{(int) The number of LSI iterations to perform.} -\item{first_feature_selection_method}{(function) Method to use for selecting features for the first iteration. Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_mean}, \code{select_features_binned_dispersion}} +\item{first_feature_selection_method}{(function) Method to use for selecting features for the first iteration. Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_accessibility}, \code{select_features_binned_dispersion}} -\item{feature_selection_method}{(function) Method to use for selecting features for each iteration after the first. Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_mean}, \code{select_features_binned_dispersion}} +\item{feature_selection_method}{(function) Method to use for selecting features for each iteration after the first. Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_accessibility}, \code{select_features_binned_dispersion}} \item{lsi_method}{(function) Method to use for LSI. Only \code{LSI} is allowed. 
The user can pass in partial parameters to \code{LSI} to customize the LSI method, such as by passing \code{LSI(n_dimensions = 30, corr_cutoff = 0.5)}.} @@ -76,9 +76,15 @@ The iterative LSI method is as follows: \item Else, turn the LSI results into a kNN matrix using \code{knn_method}, then cluster the kNN matrix using \code{cluster_method} } } + +There are some minor differences when compared to the ArchR implementation. Firstly, the ArchR implementation uses a different method for selecting features in the first iteration. ArchR utilizes top accessibility as the default, while this implementation uses binned dispersion as the default. +\code{select_features_accessibility()} can be passed in for the \code{first_feature_selection_method} argument to mimic the ArchR implementation. + +Secondly, the ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, +which BPCells does not encounter even with a non-subsetted matrix. } \seealso{ \code{LSI()} \code{DimReduction()} \code{knn_hnsw()} \code{knn_annoy()} \code{cluster_graph_leiden()} \code{cluster_graph_louvain()} \code{cluster_graph_seurat()} \code{select_features_variance()} \code{select_features_dispersion()} -\code{select_features_mean()} \code{select_features_binned_dispersion()} +\code{select_features_accessibility()} \code{select_features_binned_dispersion()} } diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index af9bdd52..0b7c4ad0 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -40,7 +40,7 @@ An object of class \code{c("LSI", "DimReduction")} with the following attributes } } \description{ -Given a \verb{(features x cells)} counts matrix, perform LSI which sequentially executes tf-idf normalization and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. 
+Given a \verb{(features x cells)} counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. } \details{ From b6e5a8818033e78ec96025343deb251b12363ecc Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 10 Feb 2025 10:19:59 -0800 Subject: [PATCH 073/142] [r] remove verbose flag from feature selection --- r/R/singlecell_utils.R | 16 +++++++--------- r/man/feature_selection.Rd | 17 ++++------------- 2 files changed, 11 insertions(+), 22 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index b201b19b..95d3429b 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -57,8 +57,7 @@ select_features_variance <- function( mat, num_feats = 0.05, normalize = NULL, - threads = 1L, - verbose = FALSE + threads = 1L ) { assert_greater_than_zero(num_feats) assert_len(num_feats, 1) @@ -71,7 +70,7 @@ select_features_variance <- function( rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) } num_feats <- min(max(num_feats, 0), nrow(mat)) - if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads, verbose = verbose)(mat) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) features_df <- tibble::tibble( feature = rownames(mat), score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",] @@ -88,8 +87,7 @@ select_features_variance <- function( select_features_dispersion <- function( mat, num_feats = 0.05, normalize = NULL, - threads = 1L, - verbose = FALSE + threads = 1L ) { assert_greater_than_zero(num_feats) assert_len(num_feats, 1) @@ -102,7 +100,7 @@ select_features_dispersion <- function( num_feats <- 
min(max(num_feats, 0), nrow(mat)) if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") assert_is(mat, "IterableMatrix") - if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads, verbose = verbose)(mat) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) features_df <- tibble::tibble( feature = rownames(mat), @@ -117,7 +115,7 @@ select_features_dispersion <- function( #' @returns #' - `select_features_accessibility`: \eqn{\mathrm{Score}(x_i) = \sum_{j=1}^{n} \bigl({x}_{ij}^{\mathrm{binarized}})\bigr)}, where \eqn{x_{ij}^{\mathrm{binarized}}} is defined as \eqn{1} if \eqn{x_{ij} != 0} and \eqn{0} otherwise. #' @export -select_features_accessibility <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L, verbose = FALSE) { +select_features_accessibility <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L) { assert_greater_than_zero(num_feats) assert_is(num_feats, "numeric") if (rlang::is_missing(mat)) return(create_partial()) @@ -127,7 +125,7 @@ select_features_accessibility <- function(mat, num_feats = 0.05, normalize = NUL rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) } num_feats <- min(max(num_feats, 0), nrow(mat)) - if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads, verbose = verbose)(mat) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) # get the sum of each feature, binarized features_df <- tibble::tibble( feature = rownames(mat), @@ -154,7 +152,7 @@ select_features_accessibility <- function(mat, num_feats = 0.05, normalize = NUL #' @export select_features_binned_dispersion <- function( mat, num_feats = 0.05, n_bins = 20, - threads = 1L, verbose = FALSE + threads = 1L ) { 
assert_greater_than_zero(num_feats) assert_len(num_feats, 1) diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index d4a2bf8e..61997713 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -7,36 +7,27 @@ \alias{select_features_binned_dispersion} \title{Feature selection functions} \usage{ -select_features_variance( - mat, - num_feats = 0.05, - normalize = NULL, - threads = 1L, - verbose = FALSE -) +select_features_variance(mat, num_feats = 0.05, normalize = NULL, threads = 1L) select_features_dispersion( mat, num_feats = 0.05, normalize = NULL, - threads = 1L, - verbose = FALSE + threads = 1L ) select_features_accessibility( mat, num_feats = 0.05, normalize = NULL, - threads = 1L, - verbose = FALSE + threads = 1L ) select_features_binned_dispersion( mat, num_feats = 0.05, n_bins = 20, - threads = 1L, - verbose = FALSE + threads = 1L ) } \arguments{ From 157e26edc279a41fb936160123ff5447efd0f4d3 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 10 Feb 2025 13:06:50 -0800 Subject: [PATCH 074/142] [r] change rcpphnsw pkg for imports to suggests --- r/DESCRIPTION | 6 +++--- r/R/singlecell_utils.R | 1 + r/tests/testthat/test-singlecell_utils.R | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 7843c25d..f8fab3f6 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -43,13 +43,13 @@ Imports: scattermore, ggrepel, RColorBrewer, - hexbin, - RcppHNSW + hexbin Suggests: IRanges, GenomicRanges, matrixStats, - igraph + igraph, + RcppHNSW Depends: R (>= 3.5.0) Config/Needs/website: pkgdown, devtools, uwot, irlba, RcppHNSW, igraph, BiocManager, bioc::BSgenome.Hsapiens.UCSC.hg38, github::GreenleafLab/motifmatchr, github::GreenleafLab/chromVARmotifs diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 95d3429b..8c63199e 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -417,6 +417,7 @@ IterativeLSI <- function( cluster_method = cluster_graph_leiden, threads 
= 1L, verbose = FALSE ) { + assert_has_package("RcppHNSW") assert_is_mat(mat) assert_true(n_iterations > 0) assert_is_wholenumber(n_iterations) diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 19da7f19..07677f19 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -234,6 +234,7 @@ test_that("LSI works", { }) test_that("Iterative LSI works", { + skip_if_not_installed("RcppHNSW") mat <- matrix(data = runif(50000, 0, 1), nrow=500, ncol = 100) %>% as("dgCMatrix") %>% as("IterableMatrix") rownames(mat) <- paste0("feat", seq_len(nrow(mat))) colnames(mat) <- paste0("cell", seq_len(ncol(mat))) From 0917f266673380a49a4a7e71978851b63598856c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 10 Feb 2025 20:22:26 -0800 Subject: [PATCH 075/142] [r] fix assert_is_mat, add tests --- r/R/errorChecking.R | 2 +- r/tests/testthat/test-errorChecking.R | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 r/tests/testthat/test-errorChecking.R diff --git a/r/R/errorChecking.R b/r/R/errorChecking.R index a7866303..e466d9e4 100644 --- a/r/R/errorChecking.R +++ b/r/R/errorChecking.R @@ -116,7 +116,7 @@ assert_is <- function(object, class, n = 1) { assert_is_mat <- function(object, n = 1) { # matrices have length set to row*col instead of being 1, so we need to check dim as well if (length(object) == 1 || !is.null(dim(object))) { - if (!is(object, "IterableMatrix") && !canCoerce(mat, "IterableMatrix")) + if (!is(object, "IterableMatrix") && !canCoerce(object, "IterableMatrix")) pretty_error(object, "must either be an IterableMatrix or coercible to an IterableMatrix", n) } else { for (mat in object) { diff --git a/r/tests/testthat/test-errorChecking.R b/r/tests/testthat/test-errorChecking.R new file mode 100644 index 00000000..afcc6a01 --- /dev/null +++ b/r/tests/testthat/test-errorChecking.R @@ -0,0 +1,19 @@ +# Copyright 2025 BPCells contributors 
+# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +test_that("assert_is_mat works", { + mat <- matrix(1:4, nrow = 2) + mat_dgc <- as(mat, "dgCMatrix") + mat_iterable <- as(mat, "IterableMatrix") + expect_no_error(assert_is_mat(mat)) + expect_no_error(assert_is_mat(mat_dgc)) + expect_no_error(assert_is_mat(c(mat_iterable, mat_iterable))) + expect_error(assert_is_mat("a")) + expect_error(assert_is_mat(c("a", "a"))) + expect_error(assert_is_mat(1)) +}) \ No newline at end of file From 82752eb06c1125e82cca8ceace9b6380a45c71e0 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 12 Feb 2025 14:38:09 -0800 Subject: [PATCH 076/142] [r] fix styling on `assert_is_mat()` --- r/R/errorChecking.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/r/R/errorChecking.R b/r/R/errorChecking.R index e466d9e4..6ef4174d 100644 --- a/r/R/errorChecking.R +++ b/r/R/errorChecking.R @@ -116,12 +116,14 @@ assert_is <- function(object, class, n = 1) { assert_is_mat <- function(object, n = 1) { # matrices have length set to row*col instead of being 1, so we need to check dim as well if (length(object) == 1 || !is.null(dim(object))) { - if (!is(object, "IterableMatrix") && !canCoerce(object, "IterableMatrix")) + if (!is(object, "IterableMatrix") && !canCoerce(object, "IterableMatrix")) { pretty_error(object, "must either be an IterableMatrix or coercible to an IterableMatrix", n) + } } else { for (mat in object) { - if (!is(mat, "IterableMatrix") && !canCoerce(mat, "IterableMatrix")) + if (!is(mat, "IterableMatrix") && !canCoerce(mat, "IterableMatrix")) { pretty_error(mat, "must either be an IterableMatrix or coercible to an IterableMatrix", n) + } } } } From 3a5a9d477381de0568d0a4812b17e03b65386482 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 12 Feb 2025 16:18:06 -0800 Subject: [PATCH 077/142] [r] add docs changes to 
normalize/binarize, give binarize partials --- r/R/transforms.R | 35 ++++++++++++++++++++++++++++++----- r/man/binarize.Rd | 15 +++++++++++++++ r/man/normalize.Rd | 12 +++++++++++- r/pkgdown/_pkgdown.yml | 2 +- 4 files changed, 57 insertions(+), 7 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index 5ee851e0..6a32ccb3 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -294,16 +294,31 @@ setMethod("short_description", "TransformBinarize", function(x) { #' comparison to the threshold is >= (strict_inequality=FALSE) #' or > (strict_inequality=TRUE). #' @return binarized IterableMatrix object +#' @description +#' If the `mat` argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. +#' This can be used to customize `normalize` parameters in other single cell functions in BPCells (e.g. `select_features_mean()`). +#' @examples +#' set.seed(12345) +#' mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +#' mat +#' mat <- as(mat, "IterableMatrix") +#' mat_binarized <- binarize(mat, threshold=1) +#' mat_binarized +#' as(mat_binarized, "dgCMatrix") +#' +#' # We can also call as a partialized function +#' binarize(threshold = 0.1)(mat) #' @export -binarize <- function(mat, threshold=0, strict_inequality=TRUE) { - assert_is(mat, "IterableMatrix") +binarize <- function(mat, threshold = 0, strict_inequality = TRUE) { assert_is(threshold, "numeric") assert_len(threshold, 1) assert_is(strict_inequality, "logical") - if (strict_inequality == TRUE && threshold < 0) + if (strict_inequality && threshold < 0) stop("binarize threshold must be greater than or equal to zero when strict_inequality is TRUE") - if (strict_inequality == FALSE && threshold <= 0) + if (!strict_inequality && threshold <= 0) stop("binarize threshold must be greater than zero when strict_inequality is FALSE") + if (rlang::is_missing(mat)) return(create_partial()) + assert_is(mat, "IterableMatrix") res <- 
wrapMatrix("TransformBinarize", convert_matrix_type(mat, "double"), @@ -944,10 +959,18 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' **Passing to `normalize` parameters with non-default arguments** #' #' If the `mat` argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. -#' This can be used to customize `normalize` parameters in other single cell functions in BPCells (e.g. `select_features_accessibility()`). +#' This can be used to customize `normalize` parameters in other single cell functions in BPCells (e.g. `select_features_mean()`). #' #' **Related functions from other packages** #' - `normalize_log`: Corresponds to `Seurat::NormalizeData()` with its default "LogNormalize" method. +#' @examples +#' set.seed(12345) +#' mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +#' mat +#' mat <- as(mat, "IterableMatrix") +#' normalize_log(mat) +#' # normalize functions can also be called with partial arguments +#' normalize_log(scale_factor = 1e5)(mat) #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_greater_than_zero(scale_factor) @@ -966,6 +989,8 @@ normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { #' Otherwise, assume feature_means has the same length and ordering as the matrix rows. #' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} #' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 2019, matching the default behavior of `Signac::RunTFIDF()`. This also matches the normalization used within `ArchR::addIterativeLSI()`, but with `binarize = FALSE`. 
+#' @examples +#' normalize_tfidf(mat) #' @export normalize_tfidf <- function( mat, feature_means = NULL, diff --git a/r/man/binarize.Rd b/r/man/binarize.Rd index c162a8c8..a112c00c 100644 --- a/r/man/binarize.Rd +++ b/r/man/binarize.Rd @@ -27,4 +27,19 @@ are set to one; otherwise, set to zero. When strict_inequality is set to FALSE, element values greater than or equal to the threshold are set to one. As an alternative, the \code{<}, \code{<=}, \code{>}, and \code{>=} operators are also supported. + +If the \code{mat} argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. +This can be used to customize \code{normalize} parameters in other single cell functions in BPCells (e.g. \code{select_features_mean()}). +} +\examples{ +set.seed(12345) +mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +mat +mat <- as(mat, "IterableMatrix") +mat_binarized <- binarize(mat, threshold=1) +mat_binarized +as(mat_binarized, "dgCMatrix") + +# We can also call as a partialized function +binarize(threshold = 0.1)(mat) } diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd index 65dd0759..22e9218c 100644 --- a/r/man/normalize.Rd +++ b/r/man/normalize.Rd @@ -38,7 +38,7 @@ Apply standard normalizations to a \verb{(features x cells)} counts matrix. \strong{Passing to \code{normalize} parameters with non-default arguments} If the \code{mat} argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. -This can be used to customize \code{normalize} parameters in other single cell functions in BPCells (e.g. \code{select_features_accessibility()}). +This can be used to customize \code{normalize} parameters in other single cell functions in BPCells (e.g. \code{select_features_mean()}). 
\strong{Related functions from other packages} \itemize{ @@ -49,3 +49,13 @@ This can be used to customize \code{normalize} parameters in other single cell f \item \code{normalize_tfidf}: This follows the formula from Stuart, Butler et al. 2019, matching the default behavior of \code{Signac::RunTFIDF()}. This also matches the normalization used within \code{ArchR::addIterativeLSI()}, but with \code{binarize = FALSE}. } } +\examples{ +set.seed(12345) +mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +mat +mat <- as(mat, "IterableMatrix") +normalize_log(mat) +# normalize functions can also be called with partial arguments +normalize_log(scale_factor = 1e5)(mat) +normalize_tfidf(mat) +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 368760c9..0d21a1a0 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -136,7 +136,7 @@ reference: - pseudobulk_matrix - title: "Single-cell analysis helpers" -- subtitle: "Normalizations and Feature Selection" +- subtitle: "Normalization and Feature Selection" - contents: - normalize_log - select_features_variance From f97cb518ff086fd2be5b765f1adc37611771f14e Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 12 Feb 2025 23:29:29 -0800 Subject: [PATCH 078/142] [r] update clustering to allow for single step processing --- r/R/clustering.R | 113 ++++++++++++++++++++++--- r/man/cluster.Rd | 25 +++++- r/man/convert_mat_to_cluster_matrix.Rd | 43 ++++++++++ r/man/knn_graph.Rd | 20 ++++- r/tests/testthat/test-clustering.R | 13 +-- 5 files changed, 194 insertions(+), 20 deletions(-) create mode 100644 r/man/convert_mat_to_cluster_matrix.Rd diff --git a/r/R/clustering.R b/r/R/clustering.R index bffa7eb8..6cd6a81c 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -6,6 +6,46 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. 
+ +#' Convert a matrix to a required input type for a clustering step +#' +#' Ensure that if a user is using a clustering function, the input matrix is converted to a knn or graph matrix. +#' Returns the input matrix if it is already the correct type, otherwise converts it to the correct type. +#' If the required matrix is an adjacency matrix, the input matrix will optionally be converted to a knn matrix if it is not one already. +#' @param mat Input matrix to be converted +#' @param required_mat_type (character) Type of matrix required for clustering. Must be one of "adjacency" or "knn". +#' @param knn_mat_method (function) Function to convert input matrix to knn matrix. Must be a (optionally partialized) +#' version of `knn_hnsw()` or `knn_annoy()`. Unused if mat is already a knn or graph matrix. +#' @param knn_graph_method (function) Function to convert input matrix to knn graph. Must be a (optionally partialized) +#' version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Unused if required_mat_type is "knn", +#' or if mat is already a graph matrix. +#' @details +#' Clustering steps in BPCells use the following pattern: +#' 1. The input matrix is converted to a knn matrix using a function named `knn_{hnsw, annoy}()` +#' 2. The knn matrix is converted to a graph matrix using one of the functions named `knn_to_*_graph()` +#' 3. The graph matrix is clustered using a function named `cluster_graph_*()`. +#' +#' Each one of these steps requires a specific type of matrix as input. +#' The typing of the input matrices is determined by the `mat_type` attribute given by the knn_to_*_graph() and `knn_{hnsw, annoy}()` +#' functions,as returned data types vary between all clustering functions. 
+#' @keywords internal +convert_mat_to_cluster_matrix <- function( + mat, + required_mat_type = c("knn", "adjacency"), + knn_mat_method = knn_annoy, + knn_graph_method = knn_to_geodesic_graph +) { + required_mat_type <- match.arg(required_mat_type) + if (is(mat, "matrix") && is.null(attr(mat, "mat_type"))) { + mat <- knn_mat_method(mat) + } + if (required_mat_type == "adjacency" && attr(mat, "mat_type") == "knn") { + mat <- knn_graph_method(mat) + } + return(mat) +} + + #' K Nearest Neighbor (KNN) Graph #' #' Convert a KNN object (e.g. returned by `knn_hnsw()` or `knn_annoy()`) into @@ -18,10 +58,16 @@ #' dist for cell x K neighbor distances #' @param use_weights boolean for whether to replace all distance weights with 1 #' @param self_loops boolean for whether to allow cells to count themselves as neighbors +#' @param knn_mat_method (function) if knn is not a knn matrix, this function will attempt to convert it to one. +#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn matrix. 
#' @return **knn_to_graph** #' Sparse matrix (dgCMatrix) where `mat[i,j]` = distance from cell `i` to #' cell `j`, or 0 if cell `j` is not in the K nearest neighbors of `i` -knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) { +knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE, knn_mat_method = knn_annoy) { + assert_is(use_weights, "logical") + assert_is(self_loops, "logical") + if (rlang::is_missing(knn)) return(create_partial()) + mat <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_mat_method = knn_mat_method) if (use_weights) { weights <- knn$dist } else { @@ -38,10 +84,10 @@ knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) { } rownames(mat) <- rownames(knn$idx) colnames(mat) <- rownames(knn$idx) + attr(mat, "mat_type") <- "graph" mat } - #' @rdname knn_graph #' @details **knn_to_snn_graph** #' Convert a knn object into a shared nearest neighbors adjacency matrix. @@ -61,8 +107,11 @@ knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) { #' These correspond to the rows, cols, and values of non-zero entries in the lower triangle #' adjacency matrix. 
`dim` is the total number of vertices (cells) in the graph #' @export -knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_type=c("matrix", "list")) { +knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_type=c("matrix", "list"), knn_mat_method = knn_annoy) { return_type <- match.arg(return_type) + assert_is(self_loops, "logical") + if (rlang::is_missing(knn)) return(create_partial()) + knn <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_mat_method = knn_mat_method) # Solve x / (2*K - x) >= min_val --> x >= 2*K*min_val / (1 + min_val) min_int <- ceiling(2*min_val*ncol(knn$idx) / (1 + min_val)) snn <- build_snn_graph_cpp(knn$idx, min_neighbors = min_int) @@ -83,10 +132,12 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t } # Return as a sparse matrix - Matrix::sparseMatrix( + res <- Matrix::sparseMatrix( i = snn$i + 1L, j = snn$j + 1L, x = snn$weight, dims = c(snn$dim, snn$dim) ) + attr(res, "mat_type") <- "graph" + return(res) } #' @rdname knn_graph @@ -111,20 +162,27 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t #' These correspond to the rows, cols, and values of non-zero entries in the lower triangle #' adjacency matrix. 
`dim` is the total number of vertices (cells) in the graph #' @export -knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads=0L) { +knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads=0L, knn_mat_method = knn_annoy) { return_type <- match.arg(return_type) + assert_is_wholenumber(threads) + if (rlang::is_missing(knn)) return(create_partial()) + knn_mat_method <- partial_apply(knn_mat_method, threads = threads) + knn <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_mat_method = knn_mat_method) graph <- build_umap_graph_cpp(knn$dist, knn$idx, threads=threads) graph$dim <- nrow(knn$idx) if (return_type == "list") { + attr(graph, "mat_type") <- "graph" return(graph) } # Return as a sparse matrix - Matrix::sparseMatrix( + res <- Matrix::sparseMatrix( i = graph$i + 1L, j = graph$j + 1L, x = graph$weight, dims = c(graph$dim, graph$dim) ) + attr(res, "mat_type") <- "graph" + return(res) } #' Cluster an adjacency matrix @@ -137,13 +195,26 @@ knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads= #' @param resolution Resolution parameter. Higher values result in more clusters #' @param objective_function Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). #' For the meaning of each option, see `igraph::cluster_leiden()`. +#' @param knn_mat_method (function) if snn represents a regular non-knn matrix, this function will attempt to convert it a knn matrix. +#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn matrix. +#' @param knn_graph_method (function) if snn represents a knn matrix, this function will attempt to convert it to a graph matrix. +#' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. +#' Ignored if snn is already a graph matrix. 
#' @param seed Random seed for clustering initialization #' @param ... Additional arguments to underlying clustering function #' @return Factor vector containing the cluster assignment for each cell. #' @export -cluster_graph_leiden <- function(snn, resolution = 1, objective_function = c("modularity", "CPM"), seed = 12531, ...) { +cluster_graph_leiden <- function( + snn, resolution = 1, objective_function = c("modularity", "CPM"), + knn_mat_method = knn_annoy, knn_graph_method = knn_to_geodesic_graph, seed = 12531, ... +) { assert_has_package("igraph") # Set seed without permanently changing seed state + if (rlang::is_missing(snn)) return(create_partial()) + snn <- convert_mat_to_cluster_matrix( + snn, required_mat_type = "adjacency", knn_mat_method = knn_mat_method, + knn_graph_method = knn_graph_method + ) prev_seed <- get_seed() on.exit(restore_seed(prev_seed), add = TRUE) set.seed(seed) @@ -160,9 +231,18 @@ cluster_graph_leiden <- function(snn, resolution = 1, objective_function = c("mo #' @rdname cluster #' @details **cluster_graph_louvain**: Louvain graph clustering algorithm `igraph::cluster_louvain()` #' @export -cluster_graph_louvain <- function(snn, resolution = 1, seed = 12531) { +cluster_graph_louvain <- function( + snn, resolution = 1, knn_mat_method = knn_annoy, + knn_graph_method = knn_to_geodesic_graph, seed = 12531 +) { assert_has_package("igraph") # Set seed without permanently changing seed state + if (rlang::is_missing(snn)) return(create_partial()) + snn <- convert_mat_to_cluster_matrix( + snn, required_mat_type = "adjacency", knn_mat_method = knn_mat_method, + knn_graph_method = knn_graph_method + ) + prev_seed <- get_seed() on.exit(restore_seed(prev_seed), add = TRUE) set.seed(seed) @@ -176,8 +256,16 @@ cluster_graph_louvain <- function(snn, resolution = 1, seed = 12531) { #' @rdname cluster #' @details **cluster_graph_seurat**: Seurat's clustering algorithm `Seurat::FindClusters()` #' @export -cluster_graph_seurat <- function(snn, resolution 
= 0.8, ...) { +cluster_graph_seurat <- function( + snn, resolution = 0.8, knn_mat_method = knn_annoy, + knn_graph_method = knn_to_geodesic_graph, ... +) { assert_has_package("Seurat") + if (rlang::is_missing(snn)) return(create_partial()) + snn <- convert_mat_to_cluster_matrix( + snn, required_mat_type = "adjacency", knn_mat_method = knn_mat_method, + knn_graph_method = knn_graph_method + ) Seurat::as.Graph(snn) %>% Seurat::FindClusters(resolution = resolution, ...) %>% .[[1]] @@ -238,6 +326,8 @@ cluster_membership_matrix <- function(groups, group_order = NULL) { #' @export knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine"), verbose = TRUE, threads = 1, ef = 100) { metric <- match.arg(metric) + assert_is(verbose, "logical") + assert_is_wholenumber(threads) if (rlang::is_missing(data)) return(create_partial()) index <- RcppHNSW::hnsw_build( data, @@ -264,6 +354,7 @@ knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine rownames(res$idx) <- rownames(data) rownames(res$dist) <- rownames(data) + attr(res, "mat_type") <- "knn" return(res) } @@ -296,5 +387,7 @@ knn_annoy <- function(data, query = data, k = 10, metric = c("euclidean", "cosin dist[i, ] <- res$dist } if (metric == "cosine") dist <- 0.5 * (dist * dist) - list(idx = idx, dist = dist) + res <- list(idx = idx, dist = dist) + attr(res, "mat_type") <- "knn" + return(res) } diff --git a/r/man/cluster.Rd b/r/man/cluster.Rd index ce47c1db..0a8e2350 100644 --- a/r/man/cluster.Rd +++ b/r/man/cluster.Rd @@ -10,13 +10,27 @@ cluster_graph_leiden( snn, resolution = 1, objective_function = c("modularity", "CPM"), + knn_mat_method = knn_annoy, + knn_graph_method = knn_to_geodesic_graph, seed = 12531, ... ) -cluster_graph_louvain(snn, resolution = 1, seed = 12531) +cluster_graph_louvain( + snn, + resolution = 1, + knn_mat_method = knn_annoy, + knn_graph_method = knn_to_geodesic_graph, + seed = 12531 +) -cluster_graph_seurat(snn, resolution = 0.8, ...) 
+cluster_graph_seurat( + snn, + resolution = 0.8, + knn_mat_method = knn_annoy, + knn_graph_method = knn_to_geodesic_graph, + ... +) } \arguments{ \item{snn}{Symmetric adjacency matrix (dgCMatrix) output from e.g. \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Only the lower triangle is used} @@ -26,6 +40,13 @@ cluster_graph_seurat(snn, resolution = 0.8, ...) \item{objective_function}{Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). For the meaning of each option, see \code{igraph::cluster_leiden()}.} +\item{knn_mat_method}{(function) if snn represents a regular non-knn matrix, this function will attempt to convert it a knn matrix. +Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn matrix.} + +\item{knn_graph_method}{(function) if snn represents a knn matrix, this function will attempt to convert it to a graph matrix. +Must be a (optionally partialized) version of \code{knn_to_graph()}, \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. 
+Ignored if snn is already a graph matrix.} + \item{seed}{Random seed for clustering initialization} \item{...}{Additional arguments to underlying clustering function} diff --git a/r/man/convert_mat_to_cluster_matrix.Rd b/r/man/convert_mat_to_cluster_matrix.Rd new file mode 100644 index 00000000..fc278e5d --- /dev/null +++ b/r/man/convert_mat_to_cluster_matrix.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{convert_mat_to_cluster_matrix} +\alias{convert_mat_to_cluster_matrix} +\title{Convert a matrix to a required input type for a clustering step} +\usage{ +convert_mat_to_cluster_matrix( + mat, + required_mat_type = c("knn", "adjacency"), + knn_mat_method = knn_annoy, + knn_graph_method = knn_to_geodesic_graph +) +} +\arguments{ +\item{mat}{Input matrix to be converted} + +\item{required_mat_type}{(character) Type of matrix required for clustering. Must be one of "adjacency" or "knn".} + +\item{knn_mat_method}{(function) Function to convert input matrix to knn matrix. Must be a (optionally partialized) +version of \code{knn_hnsw()} or \code{knn_annoy()}. Unused if mat is already a knn or graph matrix.} + +\item{knn_graph_method}{(function) Function to convert input matrix to knn graph. Must be a (optionally partialized) +version of \code{knn_to_graph()}, \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Unused if required_mat_type is "knn", +or if mat is already a graph matrix.} +} +\description{ +Ensure that if a user is using a clustering function, the input matrix is converted to a knn or graph matrix. +Returns the input matrix if it is already the correct type, otherwise converts it to the correct type. +If the required matrix is an adjacency matrix, the input matrix will optionally be converted to a knn matrix if it is not one already. 
+} +\details{ +Clustering steps in BPCells use the following pattern: +\enumerate{ +\item The input matrix is converted to a knn matrix using a function named \verb{knn_\{hnsw, annoy\}()} +\item The knn matrix is converted to a graph matrix using one of the functions named \verb{knn_to_*_graph()} +\item The graph matrix is clustered using a function named \verb{cluster_graph_*()}. +} + +Each one of these steps requires a specific type of matrix as input. +The typing of the input matrices is determined by the \code{mat_type} attribute given by the knn_to_*_graph() and \verb{knn_\{hnsw, annoy\}()} +functions,as returned data types vary between all clustering functions. +} +\keyword{internal} diff --git a/r/man/knn_graph.Rd b/r/man/knn_graph.Rd index 5d365c41..6496717f 100644 --- a/r/man/knn_graph.Rd +++ b/r/man/knn_graph.Rd @@ -6,16 +6,27 @@ \alias{knn_to_geodesic_graph} \title{K Nearest Neighbor (KNN) Graph} \usage{ -knn_to_graph(knn, use_weights = FALSE, self_loops = TRUE) +knn_to_graph( + knn, + use_weights = FALSE, + self_loops = TRUE, + knn_mat_method = knn_annoy +) knn_to_snn_graph( knn, min_val = 1/15, self_loops = FALSE, - return_type = c("matrix", "list") + return_type = c("matrix", "list"), + knn_mat_method = knn_annoy ) -knn_to_geodesic_graph(knn, return_type = c("matrix", "list"), threads = 0L) +knn_to_geodesic_graph( + knn, + return_type = c("matrix", "list"), + threads = 0L, + knn_mat_method = knn_annoy +) } \arguments{ \item{knn}{List of 2 matrices -- idx for cell x K neighbor indices, @@ -25,6 +36,9 @@ dist for cell x K neighbor distances} \item{self_loops}{Whether to allow self-loops in the output graph} +\item{knn_mat_method}{(function) if knn is not a knn matrix, this function will attempt to convert it to one. +Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn matrix.} + \item{min_val}{minimum jaccard index between neighbors. 
Values below this will round to 0} diff --git a/r/tests/testthat/test-clustering.R b/r/tests/testthat/test-clustering.R index b17723c2..cc98160b 100644 --- a/r/tests/testthat/test-clustering.R +++ b/r/tests/testthat/test-clustering.R @@ -19,11 +19,11 @@ test_that("C++ SNN calculation works",{ neighbor_sim[i,] <- sample.int(cells, k) } } - + input <- list(idx = neighbor_sim) + attr(input, "mat_type") <- "knn" min_val <- 1/15 - snn <- knn_to_snn_graph(list(idx=neighbor_sim), min_val=min_val, self_loops=TRUE) - - mat <- knn_to_graph(list(idx=neighbor_sim), use_weights=FALSE) + snn <- knn_to_snn_graph(input, min_val=min_val, self_loops=TRUE) + mat <- knn_to_graph(input, use_weights=FALSE) mat <- mat %*% t(mat) mat <- mat / (2 * k - mat) @@ -31,13 +31,16 @@ test_that("C++ SNN calculation works",{ # Prune the explicit 0 entries from storage mat <- Matrix::drop0(mat) mat <- Matrix::tril(mat) + attr(snn, "mat_type") <- NULL expect_identical( snn, as(mat, "dgCMatrix") ) - snn2 <- knn_to_snn_graph(list(idx=neighbor_sim), min_val=min_val, self_loops=FALSE) + + snn2 <- knn_to_snn_graph(input, min_val=min_val, self_loops=FALSE) diag(mat) <- 0 mat <- Matrix::drop0(mat) + attr(snn2, "mat_type") <- NULL expect_identical( snn2, as(mat, "dgCMatrix") From 225ef0022e45b7f5121cbbdff9fc1887f2d8ea34 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 12 Feb 2025 23:31:12 -0800 Subject: [PATCH 079/142] [r] revert select_features_accessibility to be mean, change LSI to work with clustering changes --- r/NAMESPACE | 2 +- r/NEWS.md | 2 +- r/R/singlecell_utils.R | 89 ++++++++++++------------ r/man/IterativeLSI.Rd | 36 +++++----- r/man/feature_selection.Rd | 23 +++--- r/tests/testthat/test-singlecell_utils.R | 4 +- 6 files changed, 79 insertions(+), 77 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 60362432..9cfed020 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -117,9 +117,9 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) 
-export(select_features_accessibility) export(select_features_binned_dispersion) export(select_features_dispersion) +export(select_features_mean) export(select_features_variance) export(select_regions) export(set_trackplot_height) diff --git a/r/NEWS.md b/r/NEWS.md index 62ecb282..f12b7042 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -12,7 +12,7 @@ Contributions welcome :) - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166) - Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168) - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #189) -- Add feature selection functions `select_features_variance()`, and `select_features_{dispersion,accessibility,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189) +- Add feature selection functions `select_features_variance()`, and `select_features_{dispersion,mean,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189) - Add `LSI()` and `IterativeLSI()` dimensionality functions to perform latent semantic indexing on a matrix (pull request #189). ## Improvements diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 8c63199e..3182c69b 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -14,8 +14,8 @@ #' Feature selection functions #' #' Apply a feature selection method to a non-normalized `(features x cells)` matrix. We recommend using counts matrices as input and -#' apply any normalizations prior to feature selection via the normalize argument (if available). 
The output of these functions is a dataframe that has columns that -#' at the minimum include the feature names and a score for each feature. +#' apply any normalizations prior to feature selection via the normalize argument (if available). Instead of directly subsetting the input matrix, +#' an output dataframe is provided indicating which features are highly variable, and the scoring of each feature. #' @rdname feature_selection #' @param mat (IterableMatrix) dimensions features x cells #' @param num_feats (float) Number of features to mark as highly_variable. If 0 < num_feats < 1, then interpret it as a fraction of features. @@ -38,15 +38,15 @@ #' mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) #' rownames(mat) <- paste0("gene", seq_len(nrow(mat))) #' mat -#' +#' mat <- as(mat, "IterableMatrix") #' select_features_variance( #' mat, #' num_feats=2, #' normalize=normalize_log #' ) #' -#' # Because of how the BPCells normalize functions behave when the matrix -#' # argument is missing, we can also customize the normalization parameters: +#' # Because of how the BPCells `normalize` functions behave when the matrix +#' # argument is missing, we can also customize the normalization parameters using partial arguments: #' select_features_variance( #' mat, #' num_feats=2, @@ -61,7 +61,7 @@ select_features_variance <- function( ) { assert_greater_than_zero(num_feats) assert_len(num_feats, 1) - assert_is(num_feats, "numeric") + assert_is_wholenumber(threads) if (rlang::is_missing(mat)) return(create_partial()) assert_is_mat(mat) @@ -82,7 +82,7 @@ select_features_variance <- function( #' @rdname feature_selection #' @returns -#' - `select_features_dispersion`: \eqn{\mathrm{Score}(x_i) = \frac{\frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2}{\bar{x}_i}} +#' - `select_features_dispersion`: \eqn{\mathrm{Score}(x_i) = \frac{\frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2}{\bar{x}_i}} #' @export select_features_dispersion <- function( mat, 
num_feats = 0.05, @@ -91,7 +91,7 @@ select_features_dispersion <- function( ) { assert_greater_than_zero(num_feats) assert_len(num_feats, 1) - assert_is(num_feats, "numeric") + assert_is_wholenumber(threads) if (rlang::is_missing(mat)) return(create_partial()) if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) if (min(max(num_feats, 0), nrow(mat)) != num_feats) { @@ -110,14 +110,13 @@ select_features_dispersion <- function( return(features_df) } -#where \eqn{x_ij} = 1 & \text{if } x_{ij} == 0 \\ 0 & \text{otherwise} \end{cases} #' @rdname feature_selection #' @returns -#' - `select_features_accessibility`: \eqn{\mathrm{Score}(x_i) = \sum_{j=1}^{n} \bigl({x}_{ij}^{\mathrm{binarized}})\bigr)}, where \eqn{x_{ij}^{\mathrm{binarized}}} is defined as \eqn{1} if \eqn{x_{ij} != 0} and \eqn{0} otherwise. +#' - `select_features_mean`: \eqn{\mathrm{Score}(x_i) = \frac{\sum_{j=1}^{n}\bigl(x_{ij}\bigr)}{n}} #' @export -select_features_accessibility <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L) { +select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L) { assert_greater_than_zero(num_feats) - assert_is(num_feats, "numeric") + assert_is_wholenumber(threads) if (rlang::is_missing(mat)) return(create_partial()) assert_is_mat(mat) if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) @@ -129,7 +128,7 @@ select_features_accessibility <- function(mat, num_feats = 0.05, normalize = NUL # get the sum of each feature, binarized features_df <- tibble::tibble( feature = rownames(mat), - score = matrix_stats(mat, row_stats = "nonzero", threads = threads)$row_stats["nonzero", ] + score = matrix_stats(mat, row_stats = "mean", threads = threads)$row_stats["mean", ] ) %>% dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) return(features_df) @@ -145,7 +144,7 @@ select_features_accessibility <- function(mat, num_feats = 0.05, normalize = NUL #' 2. 
Calculate dispersion of each feature as `log(variance / mean)` #' 3. Z-score normalize dispersion within each bin, and select highest normalized dispersion across all bins #' -#' If the number of features within a bin is equal to 1, then dhe mean dispersion for that bin is set to 1. +#' If the number of features within a bin is equal to 1, then the mean dispersion for that bin is set to 1. #' #' This should be equivalent to `Seurat::FindVariableFeatures()` with `selection.method="mean.var.plot"` #' and `scanpy.pp.highly_variable_genes()` with `flavor="seurat"`. @@ -159,6 +158,7 @@ select_features_binned_dispersion <- function( assert_is_wholenumber(n_bins) assert_len(n_bins, 1) assert_greater_than_zero(n_bins) + assert_is_wholenumber(threads) if (rlang::is_missing(mat)) return(create_partial()) assert_is_mat(mat) if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) @@ -359,52 +359,56 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' Run iterative LSI on a matrix. #' -#' Given a `(features x cells)` counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. This uses the method described in [ArchR](https://doi.org/10.1038/s41588-021-00790-6) (Granja et al; 2019). -#' See details for more specific information. Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. +#' Given a `(features x cells)` counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. +#' This uses the method described in [ArchR](https://doi.org/10.1038/s41588-021-00790-6) (Granja et al; 2019). +#' See details for more specific information. Returns a DimReduction object, which allows for projection of matrices with the same features into the same latent space. #' #' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. 
#' @param n_iterations (int) The number of LSI iterations to perform. -#' @param first_feature_selection_method (function) Method to use for selecting features for the first iteration. Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_accessibility`, `select_features_binned_dispersion` -#' @param feature_selection_method (function) Method to use for selecting features for each iteration after the first. Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_accessibility`, `select_features_binned_dispersion` -#' @param knn_method (function) Method to use for obtaining a kNN matrix for determining clusters assignments of cells. Current builtin options are `knn_hnsw()` and `knn_annoy()`. The -#' user can pass in partial parameters to the knn method, such as by passing `knn_hnsw(ef = 500, k = 12)` -#' @param cluster_method (function) Method to use for clustering a kNN matrix. Current builtin options are `cluster_graph_{leiden, louvain, seurat}()` -#' @param lsi_method (function) Method to use for LSI. Only `LSI` is allowed. The user can pass in partial parameters to `LSI` to customize the LSI method, such as by passing `LSI(n_dimensions = 30, corr_cutoff = 0.5)`. +#' @param first_feature_selection_method (function) Method to use for selecting features for the first iteration. +#' Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` +#' @param feature_selection_method (function) Method to use for selecting features for each iteration after the first. +#' Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` +#' @param cluster_method (function) Method to use for clustering a kNN matrix. +#' Current builtin options are `cluster_graph_{leiden, louvain, seurat}()`. 
+#' The user can pass in partial parameters to the cluster method, such as by passing +#' `cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))` +#' @param lsi_method (function) Method to use for LSI. Only `LSI` is allowed. The user can pass in partial parameters to `LSI` to customize the LSI method, +#' such as by passing `LSI(n_dimensions = 30, corr_cutoff = 0.5)`. #' @return An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data #' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: -#' - `first_feature_selection_method`: The method used for selecting features for the first iteration -#' - `lsi_method`: The method used for LSI -#' - `knn_method`: The method used for obtaining a kNN matrix -#' - `cluster_method`: The method used for clustering -#' - `feature_means`: The means of the features used for tf-idf normalization -#' - `iterations`: The number of LSI iterations ran -#' - `iter_info`: A tibble with the following columns: -#' - `iteration`: The iteration number -#' - `feature_names`: The names of the features used for the iteration -#' - `lsi_results`: The results of LSI for the iteration. This follows the same structure as the `fitted_params` attribute of the `LSI` object, but information such as the `v` and `d` matrices are removed. -#' - `clusters`: The clusters for the iteration. 
This is blank for the first iteration +#' - `first_feature_selection_method`: The method used for selecting features for the first iteration +#' - `lsi_method`: The method used for LSI +#' - `cluster_method`: The method used for clustering +#' - `feature_means`: The means of the features used for tf-idf normalization +#' - `iterations`: The number of LSI iterations ran +#' - `iter_info`: A tibble with the following columns: +#' - `iteration`: The iteration number +#' - `feature_names`: The names of the features used for the iteration +#' - `lsi_results`: The results of LSI for the iteration. This follows the same structure as the `fitted_params` attribute of the `LSI` object, but information such as the `v` and `d` matrices are removed. +#' - `clusters`: The clusters for the iteration. This is blank for the first iteration #' @details #' The iterative LSI method is as follows: #' - First iteration: #' - Select features based on the `first_feature_selection_method` argument #' - Perform LSI on the selected features #' - If `n_iterations` is 1, return the projected data from the first PCA projection -#' - Else, turn the LSI results into a kNN matrix using `knn_method`, then cluster the kNN matrix using `cluster_method` +#' - Else, cluster the LSI results using `cluster_method` #' - For each subsequent iteration: #' - Pseudobulk the clusters and select the top features based on the variance of the pseudobulked clusters #' - Perform LSI on the selected features #' - If this is the final iteration, return the projected data from this PCA projection -#' - Else, turn the LSI results into a kNN matrix using `knn_method`, then cluster the kNN matrix using `cluster_method` +#' - Else, cluster the LSI results using `cluster_method` #' -#' There are some minor differences when compared to the ArchR implementation. Firstly, the ArchR implementation uses a different method for selecting features in the first iteration. 
ArchR utilizes top accessibility as the default, while this implementation uses binned dispersion as the default. -#' `select_features_accessibility()` can be passed in for the `first_feature_selection_method` argument to mimic the ArchR implementation. +#' There are some minor differences when compared to the ArchR implementation. Firstly, the ArchR implementation uses a different method for selecting features in the first iteration. +#' `select_features_mean(normalize = binarize)` can be passed in for the `first_feature_selection_method` argument to mimic the ArchR implementation. #' #' Secondly, the ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, #' which BPCells does not encounter even with a non-subsetted matrix. #' @seealso `LSI()` `DimReduction()` `knn_hnsw()` `knn_annoy()` #' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` -#' `select_features_accessibility()` `select_features_binned_dispersion()` +#' `select_features_mean()` `select_features_binned_dispersion()` #' @inheritParams LSI #' @export IterativeLSI <- function( @@ -413,20 +417,19 @@ IterativeLSI <- function( first_feature_selection_method = select_features_binned_dispersion, feature_selection_method = select_features_dispersion, lsi_method = LSI, - knn_method = knn_hnsw, cluster_method = cluster_graph_leiden, threads = 1L, verbose = FALSE ) { assert_has_package("RcppHNSW") assert_is_mat(mat) - assert_true(n_iterations > 0) + assert_greater_than_zero(n_iterations) assert_is_wholenumber(n_iterations) assert_is_wholenumber(threads) + assert_is(verbose, "logical") - fitted_params = list( + fitted_params <- list( first_feature_selection_method = first_feature_selection_method, lsi_method = lsi_method, - knn_method = knn_method, cluster_method = cluster_method, feature_means = matrix_stats(mat, row_stats = 
"mean", threads = threads)$row_stats["mean",], iterations = n_iterations, @@ -471,7 +474,7 @@ IterativeLSI <- function( if (i == n_iterations) break # cluster the LSI results if (verbose) log_progress("Clustering LSI results") - clustering_res <- t(lsi_res_obj$cell_embeddings) %>% partial_apply(knn_method, threads = threads)() %>% knn_to_snn_graph() %>% cluster_method() + clustering_res <- t(lsi_res_obj$cell_embeddings) %>% partial_apply(cluster_method, threads = threads)() fitted_params$iter_info$clusters[[i]] <- clustering_res # pseudobulk and pass onto next iteration if (verbose) log_progress("Pseudobulking matrix") diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index e45dd98b..b3876031 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -10,7 +10,6 @@ IterativeLSI( first_feature_selection_method = select_features_binned_dispersion, feature_selection_method = select_features_dispersion, lsi_method = LSI, - knn_method = knn_hnsw, cluster_method = cluster_graph_leiden, threads = 1L, verbose = FALSE @@ -21,16 +20,19 @@ IterativeLSI( \item{n_iterations}{(int) The number of LSI iterations to perform.} -\item{first_feature_selection_method}{(function) Method to use for selecting features for the first iteration. Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_accessibility}, \code{select_features_binned_dispersion}} +\item{first_feature_selection_method}{(function) Method to use for selecting features for the first iteration. +Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_mean}, \code{select_features_binned_dispersion}} -\item{feature_selection_method}{(function) Method to use for selecting features for each iteration after the first. 
Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_accessibility}, \code{select_features_binned_dispersion}} +\item{feature_selection_method}{(function) Method to use for selecting features for each iteration after the first. +Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_mean}, \code{select_features_binned_dispersion}} -\item{lsi_method}{(function) Method to use for LSI. Only \code{LSI} is allowed. The user can pass in partial parameters to \code{LSI} to customize the LSI method, such as by passing \code{LSI(n_dimensions = 30, corr_cutoff = 0.5)}.} +\item{lsi_method}{(function) Method to use for LSI. Only \code{LSI} is allowed. The user can pass in partial parameters to \code{LSI} to customize the LSI method, +such as by passing \code{LSI(n_dimensions = 30, corr_cutoff = 0.5)}.} -\item{knn_method}{(function) Method to use for obtaining a kNN matrix for determining clusters assignments of cells. Current builtin options are \code{knn_hnsw()} and \code{knn_annoy()}. The -user can pass in partial parameters to the knn method, such as by passing \code{knn_hnsw(ef = 500, k = 12)}} - -\item{cluster_method}{(function) Method to use for clustering a kNN matrix. Current builtin options are \verb{cluster_graph_\{leiden, louvain, seurat\}()}} +\item{cluster_method}{(function) Method to use for clustering a kNN matrix. +Current builtin options are \verb{cluster_graph_\{leiden, louvain, seurat\}()}. 
+The user can pass in partial parameters to the cluster method, such as by passing +\code{cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))}} \item{threads}{(integer) Number of threads to use.} } @@ -39,9 +41,9 @@ An object of class \code{c("IterativeLSI", "DimReduction")} with the following a \itemize{ \item \code{cell_embeddings}: The projected data \item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: +\itemize{ \item \code{first_feature_selection_method}: The method used for selecting features for the first iteration \item \code{lsi_method}: The method used for LSI -\item \code{knn_method}: The method used for obtaining a kNN matrix \item \code{cluster_method}: The method used for clustering \item \code{feature_means}: The means of the features used for tf-idf normalization \item \code{iterations}: The number of LSI iterations ran @@ -54,9 +56,11 @@ An object of class \code{c("IterativeLSI", "DimReduction")} with the following a } } } +} \description{ -Given a \verb{(features x cells)} counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. This uses the method described in \href{https://doi.org/10.1038/s41588-021-00790-6}{ArchR} (Granja et al; 2019). -See details for more specific information. Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. +Given a \verb{(features x cells)} counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. +This uses the method described in \href{https://doi.org/10.1038/s41588-021-00790-6}{ArchR} (Granja et al; 2019). +See details for more specific information. 
Returns a DimReduction object, which allows for projection of matrices with the same features into the same latent space. } \details{ The iterative LSI method is as follows: @@ -66,19 +70,19 @@ The iterative LSI method is as follows: \item Select features based on the \code{first_feature_selection_method} argument \item Perform LSI on the selected features \item If \code{n_iterations} is 1, return the projected data from the first PCA projection -\item Else, turn the LSI results into a kNN matrix using \code{knn_method}, then cluster the kNN matrix using \code{cluster_method} +\item Else, cluster the LSI results using \code{cluster_method} } \item For each subsequent iteration: \itemize{ \item Pseudobulk the clusters and select the top features based on the variance of the pseudobulked clusters \item Perform LSI on the selected features \item If this is the final iteration, return the projected data from this PCA projection -\item Else, turn the LSI results into a kNN matrix using \code{knn_method}, then cluster the kNN matrix using \code{cluster_method} +\item Else, cluster the LSI results using \code{cluster_method} } } -There are some minor differences when compared to the ArchR implementation. Firstly, the ArchR implementation uses a different method for selecting features in the first iteration. ArchR utilizes top accessibility as the default, while this implementation uses binned dispersion as the default. -\code{select_features_accessibility()} can be passed in for the \code{first_feature_selection_method} argument to mimic the ArchR implementation. +There are some minor differences when compared to the ArchR implementation. Firstly, the ArchR implementation uses a different method for selecting features in the first iteration. +\code{select_features_mean(normalize = binarize)} can be passed in for the \code{first_feature_selection_method} argument to mimic the ArchR implementation. 
Secondly, the ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, which BPCells does not encounter even with a non-subsetted matrix. @@ -86,5 +90,5 @@ which BPCells does not encounter even with a non-subsetted matrix. \seealso{ \code{LSI()} \code{DimReduction()} \code{knn_hnsw()} \code{knn_annoy()} \code{cluster_graph_leiden()} \code{cluster_graph_louvain()} \code{cluster_graph_seurat()} \code{select_features_variance()} \code{select_features_dispersion()} -\code{select_features_accessibility()} \code{select_features_binned_dispersion()} +\code{select_features_mean()} \code{select_features_binned_dispersion()} } diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index 61997713..18e1e338 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -3,7 +3,7 @@ \name{select_features_variance} \alias{select_features_variance} \alias{select_features_dispersion} -\alias{select_features_accessibility} +\alias{select_features_mean} \alias{select_features_binned_dispersion} \title{Feature selection functions} \usage{ @@ -16,12 +16,7 @@ select_features_dispersion( threads = 1L ) -select_features_accessibility( - mat, - num_feats = 0.05, - normalize = NULL, - threads = 1L -) +select_features_mean(mat, num_feats = 0.05, normalize = NULL, threads = 1L) select_features_binned_dispersion( mat, @@ -63,7 +58,7 @@ each feature \eqn{x_i} as follows: } \itemize{ -\item \code{select_features_accessibility}: \eqn{\mathrm{Score}(x_i) = \sum_{j=1}^{n} \bigl({x}_{ij}^{\mathrm{binarized}})\bigr)}, where \eqn{x_{ij}^{\mathrm{binarized}}} is defined as \eqn{1} if \eqn{x_{ij} != 0} and \eqn{0} otherwise. 
+\item \code{select_features_mean}: \eqn{\mathrm{Score}(x_i) = \frac{\sum_{j=1}^{n}\bigl(x_{ij}\bigr)}{n}} } \itemize{ @@ -72,8 +67,8 @@ each feature \eqn{x_i} as follows: } \description{ Apply a feature selection method to a non-normalized \verb{(features x cells)} matrix. We recommend using counts matrices as input and -apply any normalizations prior to feature selection via the normalize argument (if available). The output of these functions is a dataframe that has columns that -at the minimum include the feature names and a score for each feature. +apply any normalizations prior to feature selection via the normalize argument (if available). Instead of directly subsetting the input matrix, +an output dataframe is provided indicating which features are highly variable, and the scoring of each feature. } \details{ \code{select_features_binned_dispersion} implements the approach from Satija et al. 2015: @@ -83,7 +78,7 @@ at the minimum include the feature names and a score for each feature. \item Z-score normalize dispersion within each bin, and select highest normalized dispersion across all bins } -If the number of features within a bin is equal to 1, then dhe mean dispersion for that bin is set to 1. +If the number of features within a bin is equal to 1, then the mean dispersion for that bin is set to 1. This should be equivalent to \code{Seurat::FindVariableFeatures()} with \code{selection.method="mean.var.plot"} and \code{scanpy.pp.highly_variable_genes()} with \code{flavor="seurat"}. 
@@ -93,15 +88,15 @@ set.seed(12345) mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) rownames(mat) <- paste0("gene", seq_len(nrow(mat))) mat - +mat <- as(mat, "IterableMatrix") select_features_variance( mat, num_feats=2, normalize=normalize_log ) -# Because of how the BPCells normalize functions behave when the matrix -# argument is missing, we can also customize the normalization parameters: +# Because of how the BPCells `normalize` functions behave when the matrix +# argument is missing, we can also customize the normalization parameters using partial arguments: select_features_variance( mat, num_feats=2, diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 07677f19..f99792f2 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -23,7 +23,7 @@ generate_dense_matrix <- function(nrow, ncol) { test_that("select_features works general case", { m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") - for (fn in c("select_features_variance", "select_features_dispersion", "select_features_accessibility")) { + for (fn in c("select_features_variance", "select_features_dispersion", "select_features_mean")) { res <- do.call(fn, list(m1, num_feats = 5)) expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting expect_equal(sum(res$highly_variable), 5) # Only 10 features marked as highly variable @@ -238,7 +238,7 @@ test_that("Iterative LSI works", { mat <- matrix(data = runif(50000, 0, 1), nrow=500, ncol = 100) %>% as("dgCMatrix") %>% as("IterableMatrix") rownames(mat) <- paste0("feat", seq_len(nrow(mat))) colnames(mat) <- paste0("cell", seq_len(ncol(mat))) - lsi_res_obj <- IterativeLSI(mat, n_iterations = 2, lsi_method = LSI(n_dimensions = 10)) + lsi_res_obj <- expect_no_error(IterativeLSI(mat, n_iterations = 2, lsi_method = LSI(n_dimensions = 10))) lsi_res_proj <- project(lsi_res_obj, mat) lsi_res_embedding <- 
lsi_res_obj$cell_embeddings expect_equal(ncol(lsi_res_embedding), ncol(mat)) From 293f8c45cb272c186dc28b05aaa709b98310d806 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 12 Feb 2025 23:38:47 -0800 Subject: [PATCH 080/142] [r] change lsi clustering to work with github actions --- r/tests/testthat/test-singlecell_utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index f99792f2..8eb49b0f 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -238,7 +238,7 @@ test_that("Iterative LSI works", { mat <- matrix(data = runif(50000, 0, 1), nrow=500, ncol = 100) %>% as("dgCMatrix") %>% as("IterableMatrix") rownames(mat) <- paste0("feat", seq_len(nrow(mat))) colnames(mat) <- paste0("cell", seq_len(ncol(mat))) - lsi_res_obj <- expect_no_error(IterativeLSI(mat, n_iterations = 2, lsi_method = LSI(n_dimensions = 10))) + lsi_res_obj <- expect_no_error(IterativeLSI(mat, n_iterations = 2, lsi_method = LSI(n_dimensions = 10), cluster_method = cluster_graph_louvain(knn_mat_method = knn_hnsw))) lsi_res_proj <- project(lsi_res_obj, mat) lsi_res_embedding <- lsi_res_obj$cell_embeddings expect_equal(ncol(lsi_res_embedding), ncol(mat)) From d213e3900354e5f1261101c1ac6b2a272e7bfc89 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 13 Feb 2025 11:40:52 -0800 Subject: [PATCH 081/142] [r] fix clustering partials for knn_annoy reorder binned dispersion columns rename params in LSI, IterativeLSI --- r/R/clustering.R | 3 ++- r/R/singlecell_utils.R | 30 +++++++++++------------- r/man/IterativeLSI.Rd | 4 ++-- r/man/LSI.Rd | 4 ++-- r/man/knn.Rd | 2 +- r/tests/testthat/test-singlecell_utils.R | 21 +++++++++++++++++ 6 files changed, 42 insertions(+), 22 deletions(-) diff --git a/r/R/clustering.R b/r/R/clustering.R index 6cd6a81c..3223e9db 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -363,9 +363,10 @@ knn_hnsw <- function(data, 
query = NULL, k = 10, metric = c("euclidean", "cosine #' @param n_trees Number of trees during index build time. More trees gives higher accuracy #' @param search_k Number of nodes to inspect during the query, or -1 for default value. Higher number gives higher accuracy #' @export -knn_annoy <- function(data, query = data, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, search_k = -1) { +knn_annoy <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, search_k = -1) { metric <- match.arg(metric) if (rlang::is_missing(data)) return(create_partial()) + if (is.null(query)) query <- data annoy <- switch(metric, "euclidean" = new(RcppAnnoy::AnnoyEuclidean, ncol(data)), "cosine" = new(RcppAnnoy::AnnoyAngular, ncol(data)), diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 3182c69b..facfb188 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -192,7 +192,7 @@ select_features_binned_dispersion <- function( ) %>% dplyr::ungroup() %>% dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) %>% - dplyr::select(c("feature", "dispersion", "bin", "score", "highly_variable")) %>% + dplyr::select(c("feature", "score", "highly_variable", "dispersion", "bin")) %>% dplyr::rename("raw_log_dispersion" = "dispersion") return(features_df) } @@ -256,12 +256,12 @@ project.default <- function(x, mat, ...) { #' @param scale_factor (numeric) Scaling factor to multiply matrix by prior to log normalization (see formulas below). #' @param threads (integer) Number of threads to use. #' @returns An object of class `c("LSI", "DimReduction")` with the following attributes: -#' - `cell_embeddings`: The projected data +#' - `x`: The projected data as a matrix of shape `(n_dimensions, ncol(mat))` #' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. 
Columns include the following: #' - `scale_factor`: The scale factor used for tf-idf normalization #' - `feature_means`: The means of the features used for normalization #' - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth -#' - `svd_params`: The matrix calculated for SVD +#' - `feature_loadings`: SVD component u with dimension `(n_variable_features, n_dimensions)` #' - `feature_names`: The names of the features in the matrix #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. #' @@ -315,7 +315,7 @@ LSI <- function( scale_factor = scale_factor, feature_means = mat_stats$row_stats["mean", ], pcs_to_keep = pca_feats_to_keep, - svd_params = svd_attr + feature_loadings = svd_attr$u ) res <- DimReduction( x = pca_res, @@ -333,7 +333,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { fitted_params <- x$fitted_params # Do a check to make sure that the number of rows in the matrix is the same as the number of rows in SVD$u - assert_true(nrow(mat) == nrow(fitted_params$svd_params$u)) + assert_true(nrow(mat) == nrow(fitted_params$feature_loadings)) if (!is.null(rownames(mat)) && !is.null(x$feature_names)) { assert_true(all(x$feature_names %in% rownames(mat))) mat <- mat[x$feature_names, ] @@ -348,8 +348,8 @@ project.LSI <- function(x, mat, threads = 1L, ...) { convert_matrix_type(mat, type = "float"), tempfile("mat"), compress = TRUE ) - pca_attr <- fitted_params$svd_params - res <- t(pca_attr$u) %*% mat + feature_loadings <- fitted_params$feature_loadings + res <- t(feature_loadings) %*% mat if (length(fitted_params$pcs_to_keep) != nrow(res)) { res <- res[fitted_params$pcs_to_keep, ] } @@ -376,7 +376,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @param lsi_method (function) Method to use for LSI. Only `LSI` is allowed. 
The user can pass in partial parameters to `LSI` to customize the LSI method, #' such as by passing `LSI(n_dimensions = 30, corr_cutoff = 0.5)`. #' @return An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: -#' - `cell_embeddings`: The projected data +#' - `x`: The projected data #' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: #' - `first_feature_selection_method`: The method used for selecting features for the first iteration #' - `lsi_method`: The method used for LSI @@ -386,7 +386,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' - `iter_info`: A tibble with the following columns: #' - `iteration`: The iteration number #' - `feature_names`: The names of the features used for the iteration -#' - `lsi_results`: The results of LSI for the iteration. This follows the same structure as the `fitted_params` attribute of the `LSI` object, but information such as the `v` and `d` matrices are removed. +#' - `feature_loadings`: SVD component u with dimension `(n_dimensions, n_variable_features)` #' - `clusters`: The clusters for the iteration. 
This is blank for the first iteration #' @details #' The iterative LSI method is as follows: @@ -420,13 +420,13 @@ IterativeLSI <- function( cluster_method = cluster_graph_leiden, threads = 1L, verbose = FALSE ) { + assert_has_package("RcppHNSW") assert_is_mat(mat) assert_greater_than_zero(n_iterations) assert_is_wholenumber(n_iterations) assert_is_wholenumber(threads) assert_is(verbose, "logical") - fitted_params <- list( first_feature_selection_method = first_feature_selection_method, lsi_method = lsi_method, @@ -480,9 +480,7 @@ IterativeLSI <- function( if (verbose) log_progress("Pseudobulking matrix") pseudobulk_res <- pseudobulk_matrix(mat, clustering_res, threads = as.integer(threads)) # Only take the SVD information required to project the matrix - fitted_params$iter_info$lsi_results[[i]]$svd_params <- list( - u = lsi_res_obj$fitted_params$svd_params$u - ) + fitted_params$iter_info$lsi_results[[i]]$feature_loadings <- lsi_res_obj$fitted_params$feature_loadings rownames(pseudobulk_res) <- rownames(mat) } if (verbose) log_progress("Finished running Iterative LSI") @@ -513,7 +511,7 @@ project.IterativeLSI <- function(x, mat, threads = 1L, ...) { } mat <- mat[mat_indices,] # Run LSI - # since we don't hold the LSI object, we copy the internal logic from `project.LSI()` + # since we don't hold the LSI object, copy the internal logic from `project.LSI()` lsi_attr <- attr(x$fitted_params$lsi_method, "args") mat <- normalize_tfidf( @@ -527,8 +525,8 @@ project.IterativeLSI <- function(x, mat, threads = 1L, ...) 
{ tempfile("mat"), compress = TRUE ) - pca_attr <- last_iter_info$lsi_results[[1]]$svd_params - res <- t(pca_attr$u) %*% mat + feature_loadings <- last_iter_info$lsi_results[[1]]$feature_loadings + res <- t(feature_loadings) %*% mat if (length(last_iter_info$lsi_results[[1]]$pcs_to_keep) != nrow(res)) { res <- res[last_iter_info$lsi_results[[1]]$pcs_to_keep,] } diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index b3876031..b928e868 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -39,7 +39,7 @@ The user can pass in partial parameters to the cluster method, such as by passin \value{ An object of class \code{c("IterativeLSI", "DimReduction")} with the following attributes: \itemize{ -\item \code{cell_embeddings}: The projected data +\item \code{x}: The projected data \item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: \itemize{ \item \code{first_feature_selection_method}: The method used for selecting features for the first iteration @@ -51,7 +51,7 @@ An object of class \code{c("IterativeLSI", "DimReduction")} with the following a \itemize{ \item \code{iteration}: The iteration number \item \code{feature_names}: The names of the features used for the iteration -\item \code{lsi_results}: The results of LSI for the iteration. This follows the same structure as the \code{fitted_params} attribute of the \code{LSI} object, but information such as the \code{v} and \code{d} matrices are removed. +\item \code{feature_loadings}: SVD component u with dimension \verb{(n_dimensions, n_variable_features)} \item \code{clusters}: The clusters for the iteration. 
This is blank for the first iteration } } diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index 0b7c4ad0..ecff92bb 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -28,13 +28,13 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} \value{ An object of class \code{c("LSI", "DimReduction")} with the following attributes: \itemize{ -\item \code{cell_embeddings}: The projected data +\item \code{x}: The projected data \item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: \itemize{ \item \code{scale_factor}: The scale factor used for tf-idf normalization \item \code{feature_means}: The means of the features used for normalization \item \code{pcs_to_keep}: The PCs that were kept after filtering by correlation to sequencing depth -\item \code{svd_params}: The matrix calculated for SVD +\item \code{feature_loadings}: SVD component u with dimension \verb{(n_variable_features, n_dimensions)} } \item \code{feature_names}: The names of the features in the matrix } diff --git a/r/man/knn.Rd b/r/man/knn.Rd index 3efbc9e7..5952e69b 100644 --- a/r/man/knn.Rd +++ b/r/man/knn.Rd @@ -17,7 +17,7 @@ knn_hnsw( knn_annoy( data, - query = data, + query = NULL, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 8eb49b0f..833d11d7 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -244,4 +244,25 @@ test_that("Iterative LSI works", { expect_equal(ncol(lsi_res_embedding), ncol(mat)) expect_equal(nrow(lsi_res_embedding), 10) expect_equal(lsi_res_embedding, lsi_res_proj) +}) + +test_that("Iterative LSI works with parameterized clustering", { + skip_if_not_installed("RcppAnnoy") + mat <- matrix(data = runif(50000, 0, 1), nrow=500, ncol = 100) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", 
seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + lsi_res_obj <- expect_no_error( + IterativeLSI( + mat, lsi_method = LSI(n_dimensions = 10), + cluster_method = cluster_graph_leiden( + knn_mat_method = knn_annoy(k = 12), + knn_graph_method = knn_to_snn_graph(min_val = 0.1) + ) + ) + ) + lsi_res_proj <- project(lsi_res_obj, mat) + lsi_res_embedding <- lsi_res_obj$cell_embeddings + expect_equal(ncol(lsi_res_embedding), ncol(mat)) + expect_equal(nrow(lsi_res_embedding), 10) + expect_equal(lsi_res_embedding, lsi_res_proj) }) \ No newline at end of file From 68234f014090683ed739459472d39252ad058eae Mon Sep 17 00:00:00 2001 From: Immanuel Abdi Date: Thu, 13 Feb 2025 16:57:09 -0800 Subject: [PATCH 082/142] [r] change clustering partials to use matrix/list type checking instead of attribute assignment --- r/R/clustering.R | 98 +++++++++++++++----------- r/man/cluster.Rd | 6 +- r/man/convert_mat_to_cluster_matrix.Rd | 45 ++++++------ r/man/is_adjacency_matrix.Rd | 16 +++++ r/man/is_knn_matrix.Rd | 17 +++++ r/man/knn_graph.Rd | 6 +- r/tests/testthat/test-clustering.R | 6 +- 7 files changed, 119 insertions(+), 75 deletions(-) create mode 100644 r/man/is_adjacency_matrix.Rd create mode 100644 r/man/is_knn_matrix.Rd diff --git a/r/R/clustering.R b/r/R/clustering.R index 3223e9db..b8679d79 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -7,41 +7,67 @@ # except according to those terms. -#' Convert a matrix to a required input type for a clustering step -#' -#' Ensure that if a user is using a clustering function, the input matrix is converted to a knn or graph matrix. -#' Returns the input matrix if it is already the correct type, otherwise converts it to the correct type. -#' If the required matrix is an adjacency matrix, the input matrix will optionally be converted to a knn matrix if it is not one already. -#' @param mat Input matrix to be converted -#' @param required_mat_type (character) Type of matrix required for clustering. 
Must be one of "adjacency" or "knn". -#' @param knn_mat_method (function) Function to convert input matrix to knn matrix. Must be a (optionally partialized) -#' version of `knn_hnsw()` or `knn_annoy()`. Unused if mat is already a knn or graph matrix. -#' @param knn_graph_method (function) Function to convert input matrix to knn graph. Must be a (optionally partialized) -#' version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Unused if required_mat_type is "knn", -#' or if mat is already a graph matrix. -#' @details -#' Clustering steps in BPCells use the following pattern: -#' 1. The input matrix is converted to a knn matrix using a function named `knn_{hnsw, annoy}()` -#' 2. The knn matrix is converted to a graph matrix using one of the functions named `knn_to_*_graph()` -#' 3. The graph matrix is clustered using a function named `cluster_graph_*()`. -#' -#' Each one of these steps requires a specific type of matrix as input. -#' The typing of the input matrices is determined by the `mat_type` attribute given by the knn_to_*_graph() and `knn_{hnsw, annoy}()` -#' functions,as returned data types vary between all clustering functions. +#' Check if an input is a kNN output matrix +#' +#' knn matrix functions `knn_hnsw()` and `knn_annoy()` return a list of two matrices, `idx` and `dist`. +#' These are used as inputs to create graph adjacency matrices for clustering. +#' Assume any list with at least an +#' @return TRUE if the mat is a kNN matrix, FALSE otherwise +#' @keywords internal +is_knn_matrix <- function(mat) { + return(is(mat, "list") && all(c("idx", "dist") %in% names(mat))) +} + +#' Check if an input is a graph adjacency matrix. +#' +#' Clustering function like `cluster_graph_leiden()` and `cluster_graph_louvain()` require a graph adjacency matrix as input. +#' We assume that any `dgCMatrix` that is square is a graph adjacency matrix. 
+#' @return TRUE if the mat is a graph adjacency matrix, FALSE otherwise +#' @keywords internal +is_adjacency_matrix <- function(mat) { + return(is(mat, "dgCMatrix") && nrow(mat) == ncol(mat)) +} + +#' Convert a matrix to the required input type for clustering +#' +#' Ensures that the input matrix is converted to the correct type (knn or adjacency) +#' required by a clustering function. If the input is already of the correct type, +#' it is returned as is. +#' +#' @param mat Input matrix to be converted. +#' @param required_mat_type (character) Required matrix type: "adjacency" or "knn". +#' @param knn_mat_method (function) Function to convert to a knn matrix (e.g., `knn_hnsw`, `knn_annoy`). +#' Ignored if `mat` is already a knn or graph matrix. +#' @param knn_graph_method (function) Function to convert a knn matrix to a graph matrix +#' (e.g., `knn_to_geodesic_graph`). Ignored if `required_mat_type` is "knn" or if +#' `mat` is already a graph matrix. +#' #' @details +#' This function checks the type of the input matrix `mat`. `mat` is returned without modification if +#' it is already the required type (adjacency or knn). +#' If `mat` is a standard matrix and `required_mat_type` is "knn", then `knn_mat_method` is used to convert `mat` to a knn matrix. +#' If `required_mat_type` is "adjacency", then `knn_mat_method` is used to first convert `mat` to a knn matrix, +#' then `knn_graph_method` is used to convert the knn matrix to an adjacency matrix. +#' @return The converted matrix. 
#' @keywords internal convert_mat_to_cluster_matrix <- function( mat, - required_mat_type = c("knn", "adjacency"), - knn_mat_method = knn_annoy, + required_mat_type = c("knn", "adjacency"), + knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph ) { required_mat_type <- match.arg(required_mat_type) - if (is(mat, "matrix") && is.null(attr(mat, "mat_type"))) { + if (is(mat, "matrix")) { mat <- knn_mat_method(mat) } - if (required_mat_type == "adjacency" && attr(mat, "mat_type") == "knn") { + if (required_mat_type == "knn" && !is_knn_matrix(mat)) { + pretty_error(mat, "must be a knn matrix, or convertible to one", 1) + } + if (required_mat_type == "adjacency" && is_knn_matrix(mat)) { mat <- knn_graph_method(mat) } + if (required_mat_type == "adjacency" && !is_adjacency_matrix(mat)) { + pretty_error(mat, "must be a graph adjacency matrix, or convertible to one", 1) + } return(mat) } @@ -63,7 +89,7 @@ convert_mat_to_cluster_matrix <- function( #' @return **knn_to_graph** #' Sparse matrix (dgCMatrix) where `mat[i,j]` = distance from cell `i` to #' cell `j`, or 0 if cell `j` is not in the K nearest neighbors of `i` -knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE, knn_mat_method = knn_annoy) { +knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE, knn_mat_method = knn_hnsw) { assert_is(use_weights, "logical") assert_is(self_loops, "logical") if (rlang::is_missing(knn)) return(create_partial()) @@ -84,7 +110,6 @@ knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE, knn_mat_me } rownames(mat) <- rownames(knn$idx) colnames(mat) <- rownames(knn$idx) - attr(mat, "mat_type") <- "graph" mat } @@ -107,7 +132,7 @@ knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE, knn_mat_me #' These correspond to the rows, cols, and values of non-zero entries in the lower triangle #' adjacency matrix. 
`dim` is the total number of vertices (cells) in the graph #' @export -knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_type=c("matrix", "list"), knn_mat_method = knn_annoy) { +knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_type=c("matrix", "list"), knn_mat_method = knn_hnsw) { return_type <- match.arg(return_type) assert_is(self_loops, "logical") if (rlang::is_missing(knn)) return(create_partial()) @@ -136,7 +161,6 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t i = snn$i + 1L, j = snn$j + 1L, x = snn$weight, dims = c(snn$dim, snn$dim) ) - attr(res, "mat_type") <- "graph" return(res) } @@ -162,7 +186,7 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t #' These correspond to the rows, cols, and values of non-zero entries in the lower triangle #' adjacency matrix. `dim` is the total number of vertices (cells) in the graph #' @export -knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads=0L, knn_mat_method = knn_annoy) { +knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads=0L, knn_mat_method = knn_hnsw) { return_type <- match.arg(return_type) assert_is_wholenumber(threads) if (rlang::is_missing(knn)) return(create_partial()) @@ -171,17 +195,13 @@ knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads= graph <- build_umap_graph_cpp(knn$dist, knn$idx, threads=threads) graph$dim <- nrow(knn$idx) - if (return_type == "list") { - attr(graph, "mat_type") <- "graph" - return(graph) - } + if (return_type == "list") return(graph) # Return as a sparse matrix res <- Matrix::sparseMatrix( i = graph$i + 1L, j = graph$j + 1L, x = graph$weight, dims = c(graph$dim, graph$dim) ) - attr(res, "mat_type") <- "graph" return(res) } @@ -206,7 +226,7 @@ knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads= #' @export cluster_graph_leiden <- function( snn, 
resolution = 1, objective_function = c("modularity", "CPM"), - knn_mat_method = knn_annoy, knn_graph_method = knn_to_geodesic_graph, seed = 12531, ... + knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, seed = 12531, ... ) { assert_has_package("igraph") # Set seed without permanently changing seed state @@ -232,7 +252,7 @@ cluster_graph_leiden <- function( #' @details **cluster_graph_louvain**: Louvain graph clustering algorithm `igraph::cluster_louvain()` #' @export cluster_graph_louvain <- function( - snn, resolution = 1, knn_mat_method = knn_annoy, + snn, resolution = 1, knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, seed = 12531 ) { assert_has_package("igraph") @@ -257,7 +277,7 @@ cluster_graph_louvain <- function( #' @details **cluster_graph_seurat**: Seurat's clustering algorithm `Seurat::FindClusters()` #' @export cluster_graph_seurat <- function( - snn, resolution = 0.8, knn_mat_method = knn_annoy, + snn, resolution = 0.8, knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, ... ) { assert_has_package("Seurat") @@ -354,7 +374,6 @@ knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine rownames(res$idx) <- rownames(data) rownames(res$dist) <- rownames(data) - attr(res, "mat_type") <- "knn" return(res) } @@ -389,6 +408,5 @@ knn_annoy <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosin } if (metric == "cosine") dist <- 0.5 * (dist * dist) res <- list(idx = idx, dist = dist) - attr(res, "mat_type") <- "knn" return(res) } diff --git a/r/man/cluster.Rd b/r/man/cluster.Rd index 0a8e2350..ea677ae5 100644 --- a/r/man/cluster.Rd +++ b/r/man/cluster.Rd @@ -10,7 +10,7 @@ cluster_graph_leiden( snn, resolution = 1, objective_function = c("modularity", "CPM"), - knn_mat_method = knn_annoy, + knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, seed = 12531, ... 
@@ -19,7 +19,7 @@ cluster_graph_leiden( cluster_graph_louvain( snn, resolution = 1, - knn_mat_method = knn_annoy, + knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, seed = 12531 ) @@ -27,7 +27,7 @@ cluster_graph_louvain( cluster_graph_seurat( snn, resolution = 0.8, - knn_mat_method = knn_annoy, + knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, ... ) diff --git a/r/man/convert_mat_to_cluster_matrix.Rd b/r/man/convert_mat_to_cluster_matrix.Rd index fc278e5d..9e42e331 100644 --- a/r/man/convert_mat_to_cluster_matrix.Rd +++ b/r/man/convert_mat_to_cluster_matrix.Rd @@ -2,42 +2,39 @@ % Please edit documentation in R/clustering.R \name{convert_mat_to_cluster_matrix} \alias{convert_mat_to_cluster_matrix} -\title{Convert a matrix to a required input type for a clustering step} +\title{Convert a matrix to the required input type for clustering} \usage{ convert_mat_to_cluster_matrix( mat, required_mat_type = c("knn", "adjacency"), - knn_mat_method = knn_annoy, + knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph ) } \arguments{ -\item{mat}{Input matrix to be converted} +\item{mat}{Input matrix to be converted.} -\item{required_mat_type}{(character) Type of matrix required for clustering. Must be one of "adjacency" or "knn".} +\item{required_mat_type}{(character) Required matrix type: "adjacency" or "knn".} -\item{knn_mat_method}{(function) Function to convert input matrix to knn matrix. Must be a (optionally partialized) -version of \code{knn_hnsw()} or \code{knn_annoy()}. Unused if mat is already a knn or graph matrix.} +\item{knn_mat_method}{(function) Function to convert to a knn matrix (e.g., \code{knn_hnsw}, \code{knn_annoy}). +Ignored if \code{mat} is already a knn or graph matrix.} -\item{knn_graph_method}{(function) Function to convert input matrix to knn graph. Must be a (optionally partialized) -version of \code{knn_to_graph()}, \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. 
Unused if required_mat_type is "knn", -or if mat is already a graph matrix.} +\item{knn_graph_method}{(function) Function to convert a knn matrix to a graph matrix +(e.g., \code{knn_to_geodesic_graph}). Ignored if \code{required_mat_type} is "knn" or if +\code{mat} is already a graph matrix. +#' @details +This function checks the type of the input matrix \code{mat}. \code{mat} is returned without modification if +it is already the required type (adjacency or knn). +If \code{mat} is a standard matrix and \code{required_mat_type} is "knn", then \code{knn_mat_method} is used to convert \code{mat} to a knn matrix. +If \code{required_mat_type} is "adjacency", then \code{knn_mat_method} is used to first convert \code{mat} to a knn matrix, +then \code{knn_graph_method} is used to convert the knn matrix to an adjacency matrix.} } -\description{ -Ensure that if a user is using a clustering function, the input matrix is converted to a knn or graph matrix. -Returns the input matrix if it is already the correct type, otherwise converts it to the correct type. -If the required matrix is an adjacency matrix, the input matrix will optionally be converted to a knn matrix if it is not one already. -} -\details{ -Clustering steps in BPCells use the following pattern: -\enumerate{ -\item The input matrix is converted to a knn matrix using a function named \verb{knn_\{hnsw, annoy\}()} -\item The knn matrix is converted to a graph matrix using one of the functions named \verb{knn_to_*_graph()} -\item The graph matrix is clustered using a function named \verb{cluster_graph_*()}. +\value{ +The converted matrix. } - -Each one of these steps requires a specific type of matrix as input. -The typing of the input matrices is determined by the \code{mat_type} attribute given by the knn_to_*_graph() and \verb{knn_\{hnsw, annoy\}()} -functions,as returned data types vary between all clustering functions. 
+\description{ +Ensures that the input matrix is converted to the correct type (knn or adjacency) +required by a clustering function. If the input is already of the correct type, +it is returned as is. } \keyword{internal} diff --git a/r/man/is_adjacency_matrix.Rd b/r/man/is_adjacency_matrix.Rd new file mode 100644 index 00000000..3b01e66a --- /dev/null +++ b/r/man/is_adjacency_matrix.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{is_adjacency_matrix} +\alias{is_adjacency_matrix} +\title{Check if an input is a graph adjacency matrix.} +\usage{ +is_adjacency_matrix(mat) +} +\value{ +TRUE if the mat is a graph adjacency matrix, FALSE otherwise +} +\description{ +Clustering function like \code{cluster_graph_leiden()} and \code{cluster_graph_louvain()} require a graph adjacency matrix as input. +We assume that any \code{dgCMatrix} that is square is a graph adjacency matrix. +} +\keyword{internal} diff --git a/r/man/is_knn_matrix.Rd b/r/man/is_knn_matrix.Rd new file mode 100644 index 00000000..43a1dac9 --- /dev/null +++ b/r/man/is_knn_matrix.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{is_knn_matrix} +\alias{is_knn_matrix} +\title{Check if an input is a kNN output matrix} +\usage{ +is_knn_matrix(mat) +} +\value{ +TRUE if the mat is a kNN matrix, FALSE otherwise +} +\description{ +knn matrix functions \code{knn_hnsw()} and \code{knn_annoy()} return a list of two matrices, \code{idx} and \code{dist}. +These are used as inputs to create graph adjacency matrices for clustering. 
+Assume any list with at least an \code{idx} and \code{dist} entry is a kNN matrix.
a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index facfb188..16f332ea 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -47,11 +47,13 @@ #' #' # Because of how the BPCells `normalize` functions behave when the matrix #' # argument is missing, we can also customize the normalization parameters using partial arguments: -#' select_features_variance( +#' variable_features <- select_features_variance( #' mat, #' num_feats=2, #' normalize=normalize_log(scale_factor=20) #' ) +#' # One can then filter to only variable features using the subset operator: +#' mat[variable_features$feature[variable_features$highly_variable],] #' @seealso `normalize_tfidf()` `normalize_log()` #' @export select_features_variance <- function( diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index 18e1e338..c2467bc6 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -97,11 +97,13 @@ select_features_variance( # Because of how the BPCells `normalize` functions behave when the matrix # argument is missing, we can also customize the normalization parameters using partial arguments: -select_features_variance( +variable_features <- select_features_variance( mat, num_feats=2, normalize=normalize_log(scale_factor=20) ) +# One can then filter to only variable features using the subset operator: +mat[variable_features$feature[variable_features$highly_variable],] } \seealso{ \code{normalize_tfidf()} \code{normalize_log()} From 84fef233ea2f9d78346843fb48b1429fc733e205 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 18 Feb 2025 01:49:56 -0800 Subject: [PATCH 084/142] [r] rewrite `partial_apply()` defaults, fix up docs --- r/R/clustering.R | 2 +- r/R/singlecell_utils.R | 19 +++++++++++-------- r/R/utils.R | 9 +++------ r/man/LSI.Rd | 2 +- r/man/partial_apply.Rd | 6 +----- 5 files changed, 17 insertions(+), 21 deletions(-) diff --git a/r/R/clustering.R b/r/R/clustering.R index b8679d79..efa629d1 100644 --- a/r/R/clustering.R +++ 
b/r/R/clustering.R @@ -190,7 +190,7 @@ knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads= return_type <- match.arg(return_type) assert_is_wholenumber(threads) if (rlang::is_missing(knn)) return(create_partial()) - knn_mat_method <- partial_apply(knn_mat_method, threads = threads) + knn_mat_method <- partial_apply(knn_mat_method, threads = threads, , .missing_args_error = FALSE) knn <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_mat_method = knn_mat_method) graph <- build_umap_graph_cpp(knn$dist, knn$idx, threads=threads) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 16f332ea..04290641 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -72,7 +72,7 @@ select_features_variance <- function( rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) } num_feats <- min(max(num_feats, 0), nrow(mat)) - if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads, .missing_args_error = FALSE)(mat) features_df <- tibble::tibble( feature = rownames(mat), score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",] @@ -102,7 +102,7 @@ select_features_dispersion <- function( num_feats <- min(max(num_feats, 0), nrow(mat)) if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") assert_is(mat, "IterableMatrix") - if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads, .missing_args_error = FALSE)(mat) mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) features_df <- tibble::tibble( feature = rownames(mat), @@ -126,7 +126,7 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread 
rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) } num_feats <- min(max(num_feats, 0), nrow(mat)) - if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads, .missing_args_error = FALSE)(mat) # get the sum of each feature, binarized features_df <- tibble::tibble( feature = rownames(mat), @@ -282,6 +282,7 @@ LSI <- function( assert_is_wholenumber(n_dimensions) assert_len(n_dimensions, 1) assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) assert_is_wholenumber(threads) @@ -344,7 +345,8 @@ project.LSI <- function(x, mat, threads = 1L, ...) { normalize_tfidf, feature_means = fitted_params$feature_means, scale_factor = fitted_params$scale_factor, - threads = threads + threads = threads, + .missing_args_error = TRUE )(mat) mat <- write_matrix_dir( convert_matrix_type(mat, type = "float"), @@ -450,9 +452,9 @@ IterativeLSI <- function( # run variable feature selection if (verbose) log_progress("Selecting features") if (i == 1) { - variable_features <- partial_apply(first_feature_selection_method, threads = threads)(mat) + variable_features <- partial_apply(first_feature_selection_method, threads = threads, .missing_args_error = FALSE)(mat) } else { - variable_features <- partial_apply(feature_selection_method, threads = threads)(pseudobulk_res) + variable_features <- partial_apply(feature_selection_method, threads = threads, .missing_args_error = FALSE)(pseudobulk_res) } fitted_params$iter_info$feature_names[[i]] <- variable_features %>% dplyr::filter(highly_variable) %>% dplyr::pull(feature) @@ -466,7 +468,8 @@ IterativeLSI <- function( lsi_res_obj <- partial_apply( lsi_method, threads = threads, - verbose = verbose + verbose = verbose, + .missing_args_error = FALSE 
)(mat[mat_indices,]) fitted_params$iter_info$lsi_results[[i]] <- lsi_res_obj$fitted_params # remove the feature means from the lsi results as they are already calculated @@ -476,7 +479,7 @@ IterativeLSI <- function( if (i == n_iterations) break # cluster the LSI results if (verbose) log_progress("Clustering LSI results") - clustering_res <- t(lsi_res_obj$cell_embeddings) %>% partial_apply(cluster_method, threads = threads)() + clustering_res <- t(lsi_res_obj$cell_embeddings) %>% partial_apply(cluster_method, threads = threads, .missing_args_error = FALSE)() fitted_params$iter_info$clusters[[i]] <- clustering_res # pseudobulk and pass onto next iteration if (verbose) log_progress("Pseudobulking matrix") diff --git a/r/R/utils.R b/r/R/utils.R index 3133ed8c..911c8b64 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -98,23 +98,20 @@ create_partial <- function(missing_args=list()) { #' #' @param f A function #' @param ... Named arguments to `f` -#' @param .overwrite (bool) If `f` is already an output from -#' `partial_apply()`, whether parameter re-definitions should -#' be ignored or overwrite the existing definitions #' @param .missing_args_error (bool) If `TRUE`, passing in arguments #' that are not in the function's signature will raise an error, otherwise #' they will be ignored #' @return A `bpcells_partial` object (a function with some extra attributes) #' @keywords internal -partial_apply <- function(f, ..., .overwrite = TRUE, .missing_args_error = FALSE) { +partial_apply <- function(f, ..., .missing_args_error = TRUE) { args <- rlang::list2(...) 
if (is(f, "bpcells_partial")) { prev_args <- attr(f, "args") for (a in names(prev_args)) { - if (!(.overwrite && a %in% names(args))) { + if (!(a %in% names(args))) { args[[a]] <- prev_args[[a]] - } + } } f <- attr(f, "fn") function_name <- attr(f, "body")[[1]] diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index ecff92bb..067e2680 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -28,7 +28,7 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} \value{ An object of class \code{c("LSI", "DimReduction")} with the following attributes: \itemize{ -\item \code{x}: The projected data +\item \code{x}: The projected data as a matrix of shape \verb{(n_dimensions, ncol(mat))} \item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: \itemize{ \item \code{scale_factor}: The scale factor used for tf-idf normalization diff --git a/r/man/partial_apply.Rd b/r/man/partial_apply.Rd index abafbc0b..68cfdde2 100644 --- a/r/man/partial_apply.Rd +++ b/r/man/partial_apply.Rd @@ -4,17 +4,13 @@ \alias{partial_apply} \title{Create partial function calls} \usage{ -partial_apply(f, ..., .overwrite = TRUE, .missing_args_error = FALSE) +partial_apply(f, ..., .missing_args_error = TRUE) } \arguments{ \item{f}{A function} \item{...}{Named arguments to \code{f}} -\item{.overwrite}{(bool) If \code{f} is already an output from -\code{partial_apply()}, whether parameter re-definitions should -be ignored or overwrite the existing definitions} - \item{.missing_args_error}{(bool) If \code{TRUE}, passing in arguments that are not in the function's signature will raise an error, otherwise they will be ignored} From 7351a203216c76f77dee59eb34a36083a3aeccc6 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 18 Feb 2025 02:20:50 -0800 Subject: [PATCH 085/142] [r] clean up dimreductiondocs --- r/R/singlecell_utils.R | 44 +++++++++++++++++++++++++++--------------- r/man/DimReduction.Rd | 28 ++++++++++++++++++++++++++- 
r/man/IterativeLSI.Rd | 9 +++++++-- r/man/LSI.Rd | 7 ++++++- r/man/project.Rd | 25 ------------------------ 5 files changed, 68 insertions(+), 45 deletions(-) delete mode 100644 r/man/project.Rd diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 04290641..435fbe8c 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -209,30 +209,38 @@ select_features_binned_dispersion <- function( #' Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. #' Child classes should implement a `project` method to allow for the projection of other matrices using #' the fitted transformation object. +#' @rdname DimReduction +#' @param mat (IterableMatrix) Input matrix of shape `(features x cells)`. #' @field cell_embeddings (IterableMatrix, dgCMatrix, matrix) Projected data of shape `(n_dimesions x n_cells)` of the original matrix after a dimensionality reduction. #' @field fitted_params (list) A list of parameters used for the transformation of a matrix. This should include all necessary information to project new data with the same features. #' @field feature_names (character) The names of the features that this DimReduction object was fit on. Matrices to be projected should have the same feature names. +#' @return - `DimReduction()`: DimReduction object. #' @export -DimReduction <- function(x, fitted_params = list(), ...) { - assert_is(x, c("IterableMatrix", "dgCMatrix", "matrix")) +DimReduction <- function(mat, fitted_params = list(), feature_names = character(0), ...) { + assert_is(mat, c("IterableMatrix", "dgCMatrix", "matrix")) assert_is(fitted_params, "list") structure(list( - cell_embeddings = x, + cell_embeddings = mat, fitted_params = fitted_params, + feature_names = feature_names, ... ), class = "DimReduction" ) } -#' Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. 
+#' @rdname DimReduction #' @param x DimReduction object. -#' @param mat IterableMatrix object. -#' @return IterableMatrix object of the projected data. -#' @details DimReduction subclasses should use this to project new data with the same features, to project into the same latent space. -#' All required information to run a projection should be held in x$fitted_params, including pertinent parameters when constructing the DimReduction subclass object. +#' @return - `project()`: IterableMatrix object of the projected data. +#' @details +#' **project()**: Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. +#' +#' DimReduction subclasses should use the `project` method on new data with the same features, to project into the same latent space. +#' All required information to run a projection should be held in `x$fitted_params`, including pertinent parameters when constructing the DimReduction subclass object. +#' #' If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features. #' If there are rownames, reorder the matrix to match the order of the original matrix +#' @inheritParams DimReduction #' @export project <- function(x, mat, ...) { UseMethod("project") @@ -251,6 +259,7 @@ project.default <- function(x, mat, ...) { #' #' Given a `(features x cells)` counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. +#' @rdname LSI #' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. 
If the PC has a correlation that is great or equal to @@ -258,7 +267,7 @@ project.default <- function(x, mat, ...) { #' @param scale_factor (numeric) Scaling factor to multiply matrix by prior to log normalization (see formulas below). #' @param threads (integer) Number of threads to use. #' @returns An object of class `c("LSI", "DimReduction")` with the following attributes: -#' - `x`: The projected data as a matrix of shape `(n_dimensions, ncol(mat))` +#' - `cell_embeddings`: The projected data as a matrix of shape `(n_dimensions, ncol(mat))` #' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: #' - `scale_factor`: The scale factor used for tf-idf normalization #' - `feature_means`: The means of the features used for normalization @@ -321,14 +330,15 @@ LSI <- function( feature_loadings = svd_attr$u ) res <- DimReduction( - x = pca_res, + mat = pca_res, fitted_params = fitted_params, feature_names = rownames(mat) ) class(res) <- c("LSI", class(res)) return(res) } - +#' @rdname LSI +#' @inheritParams project #' @export project.LSI <- function(x, mat, threads = 1L, ...) { assert_is_mat(mat) @@ -366,7 +376,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' Given a `(features x cells)` counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' This uses the method described in [ArchR](https://doi.org/10.1038/s41588-021-00790-6) (Granja et al; 2019). #' See details for more specific information. Returns a DimReduction object, which allows for projection of matrices with the same features into the same latent space. -#' +#' @rdname IterativeLSI #' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. #' @param n_iterations (int) The number of LSI iterations to perform. #' @param first_feature_selection_method (function) Method to use for selecting features for the first iteration. 
@@ -380,7 +390,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @param lsi_method (function) Method to use for LSI. Only `LSI` is allowed. The user can pass in partial parameters to `LSI` to customize the LSI method, #' such as by passing `LSI(n_dimensions = 30, corr_cutoff = 0.5)`. #' @return An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: -#' - `x`: The projected data +#' - `cell_embeddings`: The projected data as a matrix of shape `(n_dimensions, ncol(mat))` #' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: #' - `first_feature_selection_method`: The method used for selecting features for the first iteration #' - `lsi_method`: The method used for LSI @@ -390,10 +400,10 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' - `iter_info`: A tibble with the following columns: #' - `iteration`: The iteration number #' - `feature_names`: The names of the features used for the iteration -#' - `feature_loadings`: SVD component u with dimension `(n_dimensions, n_variable_features)` +#' - `feature_loadings`: SVD component `u` with dimension `(n_dimensions, n_variable_features)` #' - `clusters`: The clusters for the iteration. This is blank for the first iteration #' @details -#' The iterative LSI method is as follows: +#' The Iterative LSI method is as follows: #' - First iteration: #' - Select features based on the `first_feature_selection_method` argument #' - Perform LSI on the selected features @@ -490,13 +500,15 @@ IterativeLSI <- function( } if (verbose) log_progress("Finished running Iterative LSI") res <- DimReduction( - x = lsi_res_obj$cell_embeddings, + mat = lsi_res_obj$cell_embeddings, fitted_params = fitted_params, feature_names = rownames(mat) ) class(res) <- c("IterativeLSI", class(res)) return(res) } +#' @rdname IterativeLSI +#' @inheritParams project #' @export project.IterativeLSI <- function(x, mat, threads = 1L, ...) 
{ assert_is_mat(mat) diff --git a/r/man/DimReduction.Rd b/r/man/DimReduction.Rd index acb74eac..94c8a579 100644 --- a/r/man/DimReduction.Rd +++ b/r/man/DimReduction.Rd @@ -2,15 +2,41 @@ % Please edit documentation in R/singlecell_utils.R \name{DimReduction} \alias{DimReduction} +\alias{project} \title{Barebones definition of a DimReduction class.} \usage{ -DimReduction(x, fitted_params = list(), ...) +DimReduction(mat, fitted_params = list(), feature_names = character(0), ...) + +project(x, mat, ...) +} +\arguments{ +\item{mat}{(IterableMatrix) Input matrix of shape \verb{(features x cells)}.} + +\item{x}{DimReduction object.} +} +\value{ +\itemize{ +\item \code{DimReduction()}: DimReduction object. +} + +\itemize{ +\item \code{project()}: IterableMatrix object of the projected data. +} } \description{ Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. Child classes should implement a \code{project} method to allow for the projection of other matrices using the fitted transformation object. } +\details{ +\strong{project()}: Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. + +DimReduction subclasses should use the \code{project} method on new data with the same features, to project into the same latent space. +All required information to run a projection should be held in \code{x$fitted_params}, including pertinent parameters when constructing the DimReduction subclass object. + +If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features. 
+If there are rownames, reorder the matrix to match the order of the original matrix +} \section{Fields}{ \describe{ diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index b928e868..ff786dae 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/singlecell_utils.R \name{IterativeLSI} \alias{IterativeLSI} +\alias{project.IterativeLSI} \title{Run iterative LSI on a matrix.} \usage{ IterativeLSI( @@ -14,6 +15,8 @@ IterativeLSI( threads = 1L, verbose = FALSE ) + +\method{project}{IterativeLSI}(x, mat, threads = 1L, ...) } \arguments{ \item{mat}{(IterableMatrix) Counts matrix of shape \verb{(features x cells)}.} @@ -35,11 +38,13 @@ The user can pass in partial parameters to the cluster method, such as by passin \code{cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))}} \item{threads}{(integer) Number of threads to use.} + +\item{x}{DimReduction object.} } \value{ An object of class \code{c("IterativeLSI", "DimReduction")} with the following attributes: \itemize{ -\item \code{x}: The projected data +\item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(n_dimensions, ncol(mat))} \item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. 
Columns include the following: \itemize{ \item \code{first_feature_selection_method}: The method used for selecting features for the first iteration @@ -51,7 +56,7 @@ An object of class \code{c("IterativeLSI", "DimReduction")} with the following a \itemize{ \item \code{iteration}: The iteration number \item \code{feature_names}: The names of the features used for the iteration -\item \code{feature_loadings}: SVD component u with dimension \verb{(n_dimensions, n_variable_features)} +\item \code{feature_loadings}: SVD component \code{u} with dimension \verb{(n_dimensions, n_variable_features)} \item \code{clusters}: The clusters for the iteration. This is blank for the first iteration } } diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index 067e2680..db185002 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/singlecell_utils.R \name{LSI} \alias{LSI} +\alias{project.LSI} \title{Perform latent semantic indexing (LSI) on a matrix.} \usage{ LSI( @@ -12,6 +13,8 @@ LSI( threads = 1L, verbose = FALSE ) + +\method{project}{LSI}(x, mat, threads = 1L, ...) } \arguments{ \item{mat}{(IterableMatrix) Counts matrix of shape \verb{(features x cells)}.} @@ -24,11 +27,13 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} \item{scale_factor}{(numeric) Scaling factor to multiply matrix by prior to log normalization (see formulas below).} \item{threads}{(integer) Number of threads to use.} + +\item{x}{DimReduction object.} } \value{ An object of class \code{c("LSI", "DimReduction")} with the following attributes: \itemize{ -\item \code{x}: The projected data as a matrix of shape \verb{(n_dimensions, ncol(mat))} +\item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(n_dimensions, ncol(mat))} \item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. 
Columns include the following: \itemize{ \item \code{scale_factor}: The scale factor used for tf-idf normalization diff --git a/r/man/project.Rd b/r/man/project.Rd deleted file mode 100644 index 696294d5..00000000 --- a/r/man/project.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/singlecell_utils.R -\name{project} -\alias{project} -\title{Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object.} -\usage{ -project(x, mat, ...) -} -\arguments{ -\item{x}{DimReduction object.} - -\item{mat}{IterableMatrix object.} -} -\value{ -IterableMatrix object of the projected data. -} -\description{ -Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. -} -\details{ -DimReduction subclasses should use this to project new data with the same features, to project into the same latent space. -All required information to run a projection should be held in x$fitted_params, including pertinent parameters when constructing the DimReduction subclass object. -If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features. -If there are rownames, reorder the matrix to match the order of the original matrix -} From 8ddf94a93161aabc936286fb4c419b7898f0b195 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 18 Feb 2025 10:14:16 -0800 Subject: [PATCH 086/142] [r] change wrong `missing_args_error` flag on LSI --- r/R/singlecell_utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 435fbe8c..053baa0a 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -356,7 +356,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) 
{ feature_means = fitted_params$feature_means, scale_factor = fitted_params$scale_factor, threads = threads, - .missing_args_error = TRUE + .missing_args_error = FALSE )(mat) mat <- write_matrix_dir( convert_matrix_type(mat, type = "float"), From b530d13cb1ba380bf9c1c32c700a3a0b7660423a Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 18 Feb 2025 10:23:48 -0800 Subject: [PATCH 087/142] [r] remove missing_args --- r/R/utils.R | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/r/R/utils.R b/r/R/utils.R index 911c8b64..6cfd2a06 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -68,20 +68,15 @@ log_progress <- function(msg, add_timestamp = TRUE){ #' Automatically creates a partial application of the caller #' function including all non-missing arguments. #' -#' @param missing_args (named list[bool]) Any named index with a TRUE value -#' will be treated as missing. Designed to be used in the caller with the -#' `base::missing()` function to detect unspecified arguments with default values, -#' or to manually specifiy other arguments that should not be specialized #' @return A `bpcells_partial` object (a function with some extra attributes) #' @keywords internal -create_partial <- function(missing_args=list()) { +create_partial <- function() { env <- rlang::caller_env() fn_sym <- rlang::caller_call()[[1]] fn <- rlang::caller_fn() args <- list() for (n in names(formals(fn))) { - if (n %in% names(missing_args) && missing_args[[n]]) next if (rlang::is_missing(env[[n]])) next args[[n]] <- env[[n]] } From e0952de9260014778ef980c01bbfbb48625bc892 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 18 Feb 2025 10:35:39 -0800 Subject: [PATCH 088/142] [r] remove first_feature_selection argument in `IterativeLSI` --- r/R/singlecell_utils.R | 17 +++++++---------- r/man/IterativeLSI.Rd | 17 +++++++---------- r/man/create_partial.Rd | 8 +------- 3 files changed, 15 insertions(+), 27 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R 
index 053baa0a..3b9e6466 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -379,8 +379,6 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @rdname IterativeLSI #' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. #' @param n_iterations (int) The number of LSI iterations to perform. -#' @param first_feature_selection_method (function) Method to use for selecting features for the first iteration. -#' Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` #' @param feature_selection_method (function) Method to use for selecting features for each iteration after the first. #' Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` #' @param cluster_method (function) Method to use for clustering a kNN matrix. @@ -392,7 +390,6 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @return An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data as a matrix of shape `(n_dimensions, ncol(mat))` #' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: -#' - `first_feature_selection_method`: The method used for selecting features for the first iteration #' - `lsi_method`: The method used for LSI #' - `cluster_method`: The method used for clustering #' - `feature_means`: The means of the features used for tf-idf normalization @@ -405,7 +402,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) 
{ #' @details #' The Iterative LSI method is as follows: #' - First iteration: -#' - Select features based on the `first_feature_selection_method` argument +#' - Select features based on the `feature_selection_method` argument #' - Perform LSI on the selected features #' - If `n_iterations` is 1, return the projected data from the first PCA projection #' - Else, cluster the LSI results using `cluster_method` @@ -416,9 +413,11 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' - Else, cluster the LSI results using `cluster_method` #' #' There are some minor differences when compared to the ArchR implementation. Firstly, the ArchR implementation uses a different method for selecting features in the first iteration. -#' `select_features_mean(normalize = binarize)` can be passed in for the `first_feature_selection_method` argument to mimic the ArchR implementation. +#' `select_features_mean(normalize = binarize)` can be passed in for the `feature_selection_method` argument to mimic the ArchR implementation, if choosing to only run one iteration. This function +#' currently does not support utilization of different feature selection methods across each iteration. If one desires to use a different feature selection method for each iteration, +#' they can take the cluster assignments from the previous iteration and use them to select features and run LSI. #' -#' Secondly, the ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, +#' Additionally, the ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, #' which BPCells does not encounter even with a non-subsetted matrix. 
#' @seealso `LSI()` `DimReduction()` `knn_hnsw()` `knn_annoy()` #' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` @@ -428,10 +427,9 @@ project.LSI <- function(x, mat, threads = 1L, ...) { IterativeLSI <- function( mat, n_iterations = 2, - first_feature_selection_method = select_features_binned_dispersion, feature_selection_method = select_features_dispersion, lsi_method = LSI, - cluster_method = cluster_graph_leiden, + cluster_method = cluster_graph_louvain, threads = 1L, verbose = FALSE ) { @@ -442,7 +440,6 @@ IterativeLSI <- function( assert_is_wholenumber(threads) assert_is(verbose, "logical") fitted_params <- list( - first_feature_selection_method = first_feature_selection_method, lsi_method = lsi_method, cluster_method = cluster_method, feature_means = matrix_stats(mat, row_stats = "mean", threads = threads)$row_stats["mean",], @@ -462,7 +459,7 @@ IterativeLSI <- function( # run variable feature selection if (verbose) log_progress("Selecting features") if (i == 1) { - variable_features <- partial_apply(first_feature_selection_method, threads = threads, .missing_args_error = FALSE)(mat) + variable_features <- partial_apply(feature_selection_method, threads = threads, .missing_args_error = FALSE)(mat) } else { variable_features <- partial_apply(feature_selection_method, threads = threads, .missing_args_error = FALSE)(pseudobulk_res) } diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index ff786dae..dfb43c35 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -8,10 +8,9 @@ IterativeLSI( mat, n_iterations = 2, - first_feature_selection_method = select_features_binned_dispersion, feature_selection_method = select_features_dispersion, lsi_method = LSI, - cluster_method = cluster_graph_leiden, + cluster_method = cluster_graph_louvain, threads = 1L, verbose = FALSE ) @@ -23,9 +22,6 @@ IterativeLSI( \item{n_iterations}{(int) The number of LSI iterations to 
perform.} -\item{first_feature_selection_method}{(function) Method to use for selecting features for the first iteration. -Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_mean}, \code{select_features_binned_dispersion}} - \item{feature_selection_method}{(function) Method to use for selecting features for each iteration after the first. Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_mean}, \code{select_features_binned_dispersion}} @@ -47,7 +43,6 @@ An object of class \code{c("IterativeLSI", "DimReduction")} with the following a \item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(n_dimensions, ncol(mat))} \item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: \itemize{ -\item \code{first_feature_selection_method}: The method used for selecting features for the first iteration \item \code{lsi_method}: The method used for LSI \item \code{cluster_method}: The method used for clustering \item \code{feature_means}: The means of the features used for tf-idf normalization @@ -68,11 +63,11 @@ This uses the method described in \href{https://doi.org/10.1038/s41588-021-00790 See details for more specific information. Returns a DimReduction object, which allows for projection of matrices with the same features into the same latent space. 
} \details{ -The iterative LSI method is as follows: +The Iterative LSI method is as follows: \itemize{ \item First iteration: \itemize{ -\item Select features based on the \code{first_feature_selection_method} argument +\item Select features based on the \code{feature_selection_method} argument \item Perform LSI on the selected features \item If \code{n_iterations} is 1, return the projected data from the first PCA projection \item Else, cluster the LSI results using \code{cluster_method} @@ -87,9 +82,11 @@ The iterative LSI method is as follows: } There are some minor differences when compared to the ArchR implementation. Firstly, the ArchR implementation uses a different method for selecting features in the first iteration. -\code{select_features_mean(normalize = binarize)} can be passed in for the \code{first_feature_selection_method} argument to mimic the ArchR implementation. +\code{select_features_mean(normalize = binarize)} can be passed in for the \code{feature_selection_method} argument to mimic the ArchR implementation, if choosing to only run one iteration. This function +currently does not support utilization of different feature selection methods across each iteration. If one desires to use a different feature selection method for each iteration, +they can take the cluster assignments from the previous iteration and use them to select features and run LSI. -Secondly, the ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, +Additionally, the ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, which BPCells does not encounter even with a non-subsetted matrix. 
} \seealso{ diff --git a/r/man/create_partial.Rd b/r/man/create_partial.Rd index fe92fd5a..b8bc6939 100644 --- a/r/man/create_partial.Rd +++ b/r/man/create_partial.Rd @@ -4,13 +4,7 @@ \alias{create_partial} \title{Helper to create partial functions} \usage{ -create_partial(missing_args = list()) -} -\arguments{ -\item{missing_args}{(named list\link{bool}) Any named index with a TRUE value -will be treated as missing. Designed to be used in the caller with the -\code{base::missing()} function to detect unspecified arguments with default values, -or to manually specifiy other arguments that should not be specialized} +create_partial() } \value{ A \code{bpcells_partial} object (a function with some extra attributes) From c57945e550a42dc0036a3354df6f7993d20d9039 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 18 Feb 2025 10:40:44 -0800 Subject: [PATCH 089/142] [r] revert removal of `.overwrite` in `partial_apply()` --- r/R/utils.R | 7 +++++-- r/man/partial_apply.Rd | 6 +++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/r/R/utils.R b/r/R/utils.R index 6cfd2a06..b27fea00 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -93,18 +93,21 @@ create_partial <- function() { #' #' @param f A function #' @param ... Named arguments to `f` +#' @param .overwrite (bool) If `f` is already an output from +#' `partial_apply()`, whether parameter re-definitions should +#' be ignored or overwrite the existing definitions #' @param .missing_args_error (bool) If `TRUE`, passing in arguments #' that are not in the function's signature will raise an error, otherwise #' they will be ignored #' @return A `bpcells_partial` object (a function with some extra attributes) #' @keywords internal -partial_apply <- function(f, ..., .missing_args_error = TRUE) { +partial_apply <- function(f, ..., .overwrite = TRUE, .missing_args_error = TRUE) { args <- rlang::list2(...) 
if (is(f, "bpcells_partial")) { prev_args <- attr(f, "args") for (a in names(prev_args)) { - if (!(a %in% names(args))) { + if (!(.overwrite && a %in% names(args))) { args[[a]] <- prev_args[[a]] } } diff --git a/r/man/partial_apply.Rd b/r/man/partial_apply.Rd index 68cfdde2..149eb19f 100644 --- a/r/man/partial_apply.Rd +++ b/r/man/partial_apply.Rd @@ -4,13 +4,17 @@ \alias{partial_apply} \title{Create partial function calls} \usage{ -partial_apply(f, ..., .missing_args_error = TRUE) +partial_apply(f, ..., .overwrite = TRUE, .missing_args_error = TRUE) } \arguments{ \item{f}{A function} \item{...}{Named arguments to \code{f}} +\item{.overwrite}{(bool) If \code{f} is already an output from +\code{partial_apply()}, whether parameter re-definitions should +be ignored or overwrite the existing definitions} + \item{.missing_args_error}{(bool) If \code{TRUE}, passing in arguments that are not in the function's signature will raise an error, otherwise they will be ignored} From 11fec1ae1e4e7a58456887ddb4f6c6304810abcd Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 18 Feb 2025 10:49:21 -0800 Subject: [PATCH 090/142] [r] cleanup lsi, iterative lsi docs --- r/R/singlecell_utils.R | 10 ++++++++-- r/man/IterativeLSI.Rd | 4 +++- r/man/LSI.Rd | 4 +++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 3b9e6466..bcfab039 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -266,7 +266,8 @@ project.default <- function(x, mat, ...) { #' the corr_cutoff, it will be excluded from the final PCA matrix. #' @param scale_factor (numeric) Scaling factor to multiply matrix by prior to log normalization (see formulas below). #' @param threads (integer) Number of threads to use. 
-#' @returns An object of class `c("LSI", "DimReduction")` with the following attributes: +#' @returns +#' **LSI()** An object of class `c("LSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data as a matrix of shape `(n_dimensions, ncol(mat))` #' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: #' - `scale_factor`: The scale factor used for tf-idf normalization @@ -338,6 +339,8 @@ LSI <- function( return(res) } #' @rdname LSI +#' @return +#' **project()** IterableMatrix of the projected data of shape `(n_dimensions, ncol(mat))`. #' @inheritParams project #' @export project.LSI <- function(x, mat, threads = 1L, ...) { @@ -387,7 +390,8 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' `cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))` #' @param lsi_method (function) Method to use for LSI. Only `LSI` is allowed. The user can pass in partial parameters to `LSI` to customize the LSI method, #' such as by passing `LSI(n_dimensions = 30, corr_cutoff = 0.5)`. -#' @return An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: +#' @return +#' **IterativeLSI()** An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data as a matrix of shape `(n_dimensions, ncol(mat))` #' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: #' - `lsi_method`: The method used for LSI @@ -505,6 +509,8 @@ IterativeLSI <- function( return(res) } #' @rdname IterativeLSI +#' @return +#' **project()** IterableMatrix of the projected data of shape `(n_dimensions, ncol(mat))`. #' @inheritParams project #' @export project.IterativeLSI <- function(x, mat, threads = 1L, ...) 
{ diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index dfb43c35..0c8888cb 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -38,7 +38,7 @@ The user can pass in partial parameters to the cluster method, such as by passin \item{x}{DimReduction object.} } \value{ -An object of class \code{c("IterativeLSI", "DimReduction")} with the following attributes: +\strong{IterativeLSI()} An object of class \code{c("IterativeLSI", "DimReduction")} with the following attributes: \itemize{ \item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(n_dimensions, ncol(mat))} \item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: @@ -56,6 +56,8 @@ An object of class \code{c("IterativeLSI", "DimReduction")} with the following a } } } + +\strong{project()} IterableMatrix of the projected data of shape \verb{(n_dimensions, ncol(mat))}. } \description{ Given a \verb{(features x cells)} counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index db185002..f41ebf4d 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -31,7 +31,7 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} \item{x}{DimReduction object.} } \value{ -An object of class \code{c("LSI", "DimReduction")} with the following attributes: +\strong{LSI()} An object of class \code{c("LSI", "DimReduction")} with the following attributes: \itemize{ \item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(n_dimensions, ncol(mat))} \item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. 
Columns include the following: @@ -43,6 +43,8 @@ An object of class \code{c("LSI", "DimReduction")} with the following attributes } \item \code{feature_names}: The names of the features in the matrix } + +\strong{project()} IterableMatrix of the projected data of shape \verb{(n_dimensions, ncol(mat))}. } \description{ Given a \verb{(features x cells)} counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. From 882d74df5be48a912e4a1cc078c59eab19f7c7c3 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 18 Feb 2025 14:16:12 -0800 Subject: [PATCH 091/142] [r] change default feature selection for `IterativeLSI()` to var fix up docs for LSI, IterativeLSI --- r/R/singlecell_utils.R | 31 +++++++++++++++---------------- r/man/IterativeLSI.Rd | 4 ++-- r/man/LSI.Rd | 4 ++-- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index bcfab039..013ff0e6 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -256,9 +256,9 @@ project.default <- function(x, mat, ...) { #' Perform latent semantic indexing (LSI) on a matrix. -#' +#' #' Given a `(features x cells)` counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. -#' Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. +#' Returns a DimReduction object, which contains the projected matrix and also allows for projection of new matrices with the same features into the same latent space. #' @rdname LSI #' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. @@ -269,14 +269,14 @@ project.default <- function(x, mat, ...) 
{ #' @returns #' **LSI()** An object of class `c("LSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data as a matrix of shape `(n_dimensions, ncol(mat))` -#' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: +#' - `fitted_params`: A list of the parameters used for iterative LSI, with rows as iterations. Columns include the following: #' - `scale_factor`: The scale factor used for tf-idf normalization #' - `feature_means`: The means of the features used for normalization #' - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth #' - `feature_loadings`: SVD component u with dimension `(n_variable_features, n_dimensions)` #' - `feature_names`: The names of the features in the matrix #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. -#' +#' #' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: #' - 17.1 MB memory usage, 25.1 seconds runtime #' @seealso `project()` `DimReduction()` `normalize_tfidf()` @@ -375,7 +375,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' Run iterative LSI on a matrix. -#' +#' #' Given a `(features x cells)` counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' This uses the method described in [ArchR](https://doi.org/10.1038/s41588-021-00790-6) (Granja et al; 2019). #' See details for more specific information. Returns a DimReduction object, which allows for projection of matrices with the same features into the same latent space. @@ -384,7 +384,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @param n_iterations (int) The number of LSI iterations to perform. 
#' @param feature_selection_method (function) Method to use for selecting features for each iteration after the first. #' Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` -#' @param cluster_method (function) Method to use for clustering a kNN matrix. +#' @param cluster_method (function) Method to use for clustering a kNN matrix. #' Current builtin options are `cluster_graph_{leiden, louvain, seurat}()`. #' The user can pass in partial parameters to the cluster method, such as by passing #' `cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))` @@ -406,7 +406,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @details #' The Iterative LSI method is as follows: #' - First iteration: -#' - Select features based on the `feature_selection_method` argument +#' - Select features using `feature_selection_method` #' - Perform LSI on the selected features #' - If `n_iterations` is 1, return the projected data from the first PCA projection #' - Else, cluster the LSI results using `cluster_method` @@ -415,28 +415,27 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' - Perform LSI on the selected features #' - If this is the final iteration, return the projected data from this PCA projection #' - Else, cluster the LSI results using `cluster_method` -#' +#' #' There are some minor differences when compared to the ArchR implementation. Firstly, the ArchR implementation uses a different method for selecting features in the first iteration. #' `select_features_mean(normalize = binarize)` can be passed in for the `feature_selection_method` argument to mimic the ArchR implementation, if choosing to only run one iteration. This function #' currently does not support utilization of different feature selection methods across each iteration. 
If one desires to use a different feature selection method for each iteration, #' they can take the cluster assignments from the previous iteration and use them to select features and run LSI. -#' +#' #' Additionally, the ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, #' which BPCells does not encounter even with a non-subsetted matrix. -#' @seealso `LSI()` `DimReduction()` `knn_hnsw()` `knn_annoy()` -#' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` +#' @seealso `LSI()` `DimReduction()` `knn_hnsw()` `knn_annoy()` +#' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` #' `select_features_mean()` `select_features_binned_dispersion()` #' @inheritParams LSI #' @export IterativeLSI <- function( - mat, + mat, n_iterations = 2, - feature_selection_method = select_features_dispersion, + feature_selection_method = select_features_variance, lsi_method = LSI, cluster_method = cluster_graph_louvain, threads = 1L, verbose = FALSE -) { - +) { assert_has_package("RcppHNSW") assert_is_mat(mat) assert_greater_than_zero(n_iterations) @@ -444,6 +443,7 @@ IterativeLSI <- function( assert_is_wholenumber(threads) assert_is(verbose, "logical") fitted_params <- list( + feature_selection_method = feature_selection_method, lsi_method = lsi_method, cluster_method = cluster_method, feature_means = matrix_stats(mat, row_stats = "mean", threads = threads)$row_stats["mean",], @@ -468,7 +468,6 @@ IterativeLSI <- function( variable_features <- partial_apply(feature_selection_method, threads = threads, .missing_args_error = FALSE)(pseudobulk_res) } fitted_params$iter_info$feature_names[[i]] <- variable_features %>% dplyr::filter(highly_variable) %>% dplyr::pull(feature) - if 
(is.character(fitted_params$iter_info$feature_names[[i]])) { mat_indices <- which(rownames(mat) %in% fitted_params$iter_info$feature_names[[i]]) } else { diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 0c8888cb..53480b58 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -8,7 +8,7 @@ IterativeLSI( mat, n_iterations = 2, - feature_selection_method = select_features_dispersion, + feature_selection_method = select_features_variance, lsi_method = LSI, cluster_method = cluster_graph_louvain, threads = 1L, @@ -69,7 +69,7 @@ The Iterative LSI method is as follows: \itemize{ \item First iteration: \itemize{ -\item Select features based on the \code{feature_selection_method} argument +\item Select features using \code{feature_selection_method} \item Perform LSI on the selected features \item If \code{n_iterations} is 1, return the projected data from the first PCA projection \item Else, cluster the LSI results using \code{cluster_method} diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index f41ebf4d..36bf479f 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -34,7 +34,7 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} \strong{LSI()} An object of class \code{c("LSI", "DimReduction")} with the following attributes: \itemize{ \item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(n_dimensions, ncol(mat))} -\item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: +\item \code{fitted_params}: A list of the parameters used for iterative LSI, with rows as iterations. 
Columns include the following: \itemize{ \item \code{scale_factor}: The scale factor used for tf-idf normalization \item \code{feature_means}: The means of the features used for normalization @@ -48,7 +48,7 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} } \description{ Given a \verb{(features x cells)} counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. -Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. +Returns a DimReduction object, which contains the projected matrix and also allows for projection of new matrices with the same features into the same latent space. } \details{ Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. From 732c800b208257757bf01c13b526c3ec3e89bd7f Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 18 Feb 2025 17:49:54 -0800 Subject: [PATCH 092/142] [r] fill out `is_knn_matrix()` docs --- r/R/clustering.R | 8 ++++---- r/man/convert_mat_to_cluster_matrix.Rd | 15 ++++++++------- r/man/is_knn_matrix.Rd | 2 +- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/r/R/clustering.R b/r/R/clustering.R index efa629d1..a4fb9f5e 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -11,7 +11,7 @@ #' #' knn matrix functions `knn_hnsw()` and `knn_annoy()` return a list of two matrices, `idx` and `dist`. #' These are used as inputs to create graph adjacency matrices for clustering. -#' Assume any list with at least an +#' Assume any list with both `idx` and `dist` is a kNN object. 
#' @return TRUE if the mat is a kNN matrix, FALSE otherwise #' @keywords internal is_knn_matrix <- function(mat) { @@ -41,11 +41,11 @@ is_adjacency_matrix <- function(mat) { #' @param knn_graph_method (function) Function to convert a knn matrix to a graph matrix #' (e.g., `knn_to_geodesic_graph`). Ignored if `required_mat_type` is "knn" or if #' `mat` is already a graph matrix. -#' #' @details +#' @details #' This function checks the type of the input matrix `mat`. `mat` is returned without modification if #' it is already the required type (adjacency or knn). #' If `mat` is a standard matrix and `required_mat_type` is "knn", then `knn_mat_method` is used to convert `mat` to a knn matrix. -#' If `required_mat_type` is "adjacency", then `knn_mat_method` is used to first convert `mat` to a knn matrix, +#' If `required_mat_type` is "adjacency", then `knn_mat_method` is used to first convert `mat` to a knn matrix, #' then `knn_graph_method` is used to convert the knn matrix to an adjacency matrix. #' @return The converted matrix. 
#' @keywords internal @@ -190,7 +190,7 @@ knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads= return_type <- match.arg(return_type) assert_is_wholenumber(threads) if (rlang::is_missing(knn)) return(create_partial()) - knn_mat_method <- partial_apply(knn_mat_method, threads = threads, , .missing_args_error = FALSE) + knn_mat_method <- partial_apply(knn_mat_method, threads = threads, .missing_args_error = FALSE) knn <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_mat_method = knn_mat_method) graph <- build_umap_graph_cpp(knn$dist, knn$idx, threads=threads) diff --git a/r/man/convert_mat_to_cluster_matrix.Rd b/r/man/convert_mat_to_cluster_matrix.Rd index 9e42e331..1842dc9e 100644 --- a/r/man/convert_mat_to_cluster_matrix.Rd +++ b/r/man/convert_mat_to_cluster_matrix.Rd @@ -21,13 +21,7 @@ Ignored if \code{mat} is already a knn or graph matrix.} \item{knn_graph_method}{(function) Function to convert a knn matrix to a graph matrix (e.g., \code{knn_to_geodesic_graph}). Ignored if \code{required_mat_type} is "knn" or if -\code{mat} is already a graph matrix. -#' @details -This function checks the type of the input matrix \code{mat}. \code{mat} is returned without modification if -it is already the required type (adjacency or knn). -If \code{mat} is a standard matrix and \code{required_mat_type} is "knn", then \code{knn_mat_method} is used to convert \code{mat} to a knn matrix. -If \code{required_mat_type} is "adjacency", then \code{knn_mat_method} is used to first convert \code{mat} to a knn matrix, -then \code{knn_graph_method} is used to convert the knn matrix to an adjacency matrix.} +\code{mat} is already a graph matrix.} } \value{ The converted matrix. @@ -37,4 +31,11 @@ Ensures that the input matrix is converted to the correct type (knn or adjacency required by a clustering function. If the input is already of the correct type, it is returned as is. 
} +\details{ +This function checks the type of the input matrix \code{mat}. \code{mat} is returned without modification if +it is already the required type (adjacency or knn). +If \code{mat} is a standard matrix and \code{required_mat_type} is "knn", then \code{knn_mat_method} is used to convert \code{mat} to a knn matrix. +If \code{required_mat_type} is "adjacency", then \code{knn_mat_method} is used to first convert \code{mat} to a knn matrix, +then \code{knn_graph_method} is used to convert the knn matrix to an adjacency matrix. +} \keyword{internal} diff --git a/r/man/is_knn_matrix.Rd b/r/man/is_knn_matrix.Rd index 43a1dac9..fc84877a 100644 --- a/r/man/is_knn_matrix.Rd +++ b/r/man/is_knn_matrix.Rd @@ -12,6 +12,6 @@ TRUE if the mat is a kNN matrix, FALSE otherwise \description{ knn matrix functions \code{knn_hnsw()} and \code{knn_annoy()} return a list of two matrices, \code{idx} and \code{dist}. These are used as inputs to create graph adjacency matrices for clustering. -Assume any list with at least an +Assume any list with both \code{idx} and \code{dist} is a kNN object. } \keyword{internal} From a5b200b14f43679832ccddb315004aa1620c6931 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 19 Feb 2025 16:03:04 -0800 Subject: [PATCH 093/142] [r] make feature selection use `normalize_method` argument --- r/R/singlecell_utils.R | 43 ++++++++++++------------ r/man/IterativeLSI.Rd | 4 +-- r/man/LSI.Rd | 4 +-- r/man/feature_selection.Rd | 15 ++++++--- r/tests/testthat/test-singlecell_utils.R | 6 ++-- 5 files changed, 38 insertions(+), 34 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 013ff0e6..f1dd04dc 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -19,7 +19,7 @@ #' @rdname feature_selection #' @param mat (IterableMatrix) dimensions features x cells #' @param num_feats (float) Number of features to mark as highly_variable. If 0 < num_feats < 1, then interpret it as a fraction of features. 
-#' @param normalize (function) Normalize the matrix prior to feature selection by calling normalize(mat) if it's not NULL. +#' @param normalize_method (function) Normalize the matrix prior to feature selection by calling normalize(mat) if it's not NULL. #' For example, pass normalize_log() or normalize_tfidf(). #' If the normalize function accepts a threads argument, that will passed as normalize(mat, threads=threads). #' @param threads (integer) Number of threads to use. @@ -42,7 +42,7 @@ #' select_features_variance( #' mat, #' num_feats=2, -#' normalize=normalize_log +#' normalize_method=normalize_log #' ) #' #' # Because of how the BPCells `normalize` functions behave when the matrix @@ -50,7 +50,7 @@ #' variable_features <- select_features_variance( #' mat, #' num_feats=2, -#' normalize=normalize_log(scale_factor=20) +#' normalize_method=normalize_log(scale_factor=20) #' ) #' # One can then filter to only variable features using the subset operator: #' mat[variable_features$feature[variable_features$highly_variable],] @@ -58,7 +58,7 @@ #' @export select_features_variance <- function( mat, num_feats = 0.05, - normalize = NULL, + normalize_method = NULL, threads = 1L ) { assert_greater_than_zero(num_feats) @@ -72,7 +72,7 @@ select_features_variance <- function( rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) } num_feats <- min(max(num_feats, 0), nrow(mat)) - if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads, .missing_args_error = FALSE)(mat) + if (!is.null(normalize_method)) mat <- partial_apply(normalize_method, threads = threads, .missing_args_error = FALSE)(mat) features_df <- tibble::tibble( feature = rownames(mat), score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",] @@ -88,7 +88,7 @@ select_features_variance <- function( #' @export select_features_dispersion <- function( mat, num_feats = 
0.05, - normalize = NULL, + normalize_method = NULL, threads = 1L ) { assert_greater_than_zero(num_feats) @@ -102,7 +102,7 @@ select_features_dispersion <- function( num_feats <- min(max(num_feats, 0), nrow(mat)) if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") assert_is(mat, "IterableMatrix") - if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads, .missing_args_error = FALSE)(mat) + if (!is.null(normalize_method)) mat <- partial_apply(normalize_method, threads = threads, .missing_args_error = FALSE)(mat) mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) features_df <- tibble::tibble( feature = rownames(mat), @@ -256,9 +256,9 @@ project.default <- function(x, mat, ...) { #' Perform latent semantic indexing (LSI) on a matrix. -#' +#' #' Given a `(features x cells)` counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. -#' Returns a DimReduction object, which contains the projected matrix and also allows for projection of new matrices with the same features into the same latent space. +#' Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. #' @rdname LSI #' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. @@ -269,14 +269,14 @@ project.default <- function(x, mat, ...) { #' @returns #' **LSI()** An object of class `c("LSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data as a matrix of shape `(n_dimensions, ncol(mat))` -#' - `fitted_params`: A list of the parameters used for iterative LSI, with rows as iterations. Columns include the following: +#' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. 
Columns include the following: #' - `scale_factor`: The scale factor used for tf-idf normalization #' - `feature_means`: The means of the features used for normalization #' - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth #' - `feature_loadings`: SVD component u with dimension `(n_variable_features, n_dimensions)` #' - `feature_names`: The names of the features in the matrix #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. -#' +#' #' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: #' - 17.1 MB memory usage, 25.1 seconds runtime #' @seealso `project()` `DimReduction()` `normalize_tfidf()` @@ -375,7 +375,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' Run iterative LSI on a matrix. -#' +#' #' Given a `(features x cells)` counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' This uses the method described in [ArchR](https://doi.org/10.1038/s41588-021-00790-6) (Granja et al; 2019). #' See details for more specific information. Returns a DimReduction object, which allows for projection of matrices with the same features into the same latent space. @@ -384,7 +384,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @param n_iterations (int) The number of LSI iterations to perform. #' @param feature_selection_method (function) Method to use for selecting features for each iteration after the first. #' Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` -#' @param cluster_method (function) Method to use for clustering a kNN matrix. +#' @param cluster_method (function) Method to use for clustering a kNN matrix. #' Current builtin options are `cluster_graph_{leiden, louvain, seurat}()`. 
#' The user can pass in partial parameters to the cluster method, such as by passing #' `cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))` @@ -406,7 +406,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @details #' The Iterative LSI method is as follows: #' - First iteration: -#' - Select features using `feature_selection_method` +#' - Select features based on the `feature_selection_method` argument #' - Perform LSI on the selected features #' - If `n_iterations` is 1, return the projected data from the first PCA projection #' - Else, cluster the LSI results using `cluster_method` @@ -415,27 +415,27 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' - Perform LSI on the selected features #' - If this is the final iteration, return the projected data from this PCA projection #' - Else, cluster the LSI results using `cluster_method` -#' +#' #' There are some minor differences when compared to the ArchR implementation. Firstly, the ArchR implementation uses a different method for selecting features in the first iteration. #' `select_features_mean(normalize = binarize)` can be passed in for the `feature_selection_method` argument to mimic the ArchR implementation, if choosing to only run one iteration. This function #' currently does not support utilization of different feature selection methods across each iteration. If one desires to use a different feature selection method for each iteration, #' they can take the cluster assignments from the previous iteration and use them to select features and run LSI. -#' +#' #' Additionally, the ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, #' which BPCells does not encounter even with a non-subsetted matrix. 
-#' @seealso `LSI()` `DimReduction()` `knn_hnsw()` `knn_annoy()` -#' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` +#' @seealso `LSI()` `DimReduction()` `knn_hnsw()` `knn_annoy()` +#' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` #' `select_features_mean()` `select_features_binned_dispersion()` #' @inheritParams LSI #' @export IterativeLSI <- function( - mat, + mat, n_iterations = 2, feature_selection_method = select_features_variance, lsi_method = LSI, - cluster_method = cluster_graph_louvain, + cluster_method = cluster_graph_leiden, threads = 1L, verbose = FALSE -) { +) { assert_has_package("RcppHNSW") assert_is_mat(mat) assert_greater_than_zero(n_iterations) @@ -443,7 +443,6 @@ IterativeLSI <- function( assert_is_wholenumber(threads) assert_is(verbose, "logical") fitted_params <- list( - feature_selection_method = feature_selection_method, lsi_method = lsi_method, cluster_method = cluster_method, feature_means = matrix_stats(mat, row_stats = "mean", threads = threads)$row_stats["mean",], diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 53480b58..939d7e83 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -10,7 +10,7 @@ IterativeLSI( n_iterations = 2, feature_selection_method = select_features_variance, lsi_method = LSI, - cluster_method = cluster_graph_louvain, + cluster_method = cluster_graph_leiden, threads = 1L, verbose = FALSE ) @@ -69,7 +69,7 @@ The Iterative LSI method is as follows: \itemize{ \item First iteration: \itemize{ -\item Select features using \code{feature_selection_method} +\item Select features based on the \code{feature_selection_method} argument \item Perform LSI on the selected features \item If \code{n_iterations} is 1, return the projected data from the first PCA projection \item Else, cluster the LSI results using 
\code{cluster_method} diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index 36bf479f..f41ebf4d 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -34,7 +34,7 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} \strong{LSI()} An object of class \code{c("LSI", "DimReduction")} with the following attributes: \itemize{ \item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(n_dimensions, ncol(mat))} -\item \code{fitted_params}: A list of the parameters used for iterative LSI, with rows as iterations. Columns include the following: +\item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: \itemize{ \item \code{scale_factor}: The scale factor used for tf-idf normalization \item \code{feature_means}: The means of the features used for normalization @@ -48,7 +48,7 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} } \description{ Given a \verb{(features x cells)} counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. -Returns a DimReduction object, which contains the projected matrix and also allows for projection of new matrices with the same features into the same latent space. +Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. } \details{ Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index c2467bc6..730aa1a8 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -7,12 +7,17 @@ \alias{select_features_binned_dispersion} \title{Feature selection functions} \usage{ -select_features_variance(mat, num_feats = 0.05, normalize = NULL, threads = 1L) +select_features_variance( + mat, + num_feats = 0.05, + normalize_method = NULL, + threads = 1L +) select_features_dispersion( mat, num_feats = 0.05, - normalize = NULL, + normalize_method = NULL, threads = 1L ) @@ -30,7 +35,7 @@ select_features_binned_dispersion( \item{num_feats}{(float) Number of features to mark as highly_variable. If 0 < num_feats < 1, then interpret it as a fraction of features.} -\item{normalize}{(function) Normalize the matrix prior to feature selection by calling normalize(mat) if it's not NULL. +\item{normalize_method}{(function) Normalize the matrix prior to feature selection by calling normalize(mat) if it's not NULL. For example, pass normalize_log() or normalize_tfidf(). 
If the normalize function accepts a threads argument, that will passed as normalize(mat, threads=threads).} @@ -92,7 +97,7 @@ mat <- as(mat, "IterableMatrix") select_features_variance( mat, num_feats=2, - normalize=normalize_log + normalize_method=normalize_log ) # Because of how the BPCells `normalize` functions behave when the matrix @@ -100,7 +105,7 @@ select_features_variance( variable_features <- select_features_variance( mat, num_feats=2, - normalize=normalize_log(scale_factor=20) + normalize_method=normalize_log(scale_factor=20) ) # One can then filter to only variable features using the subset operator: mat[variable_features$feature[variable_features$highly_variable],] diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 833d11d7..61cb7f54 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -35,10 +35,10 @@ test_that("select_features works general case", { expect_identical(res_more_feats_than_rows, res_feats_equal_rows) if (fn == "select_features_variance") { # Check that normalization actually does something - res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) + res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize_method = NULL)) # Check that we can do partial functions on normalization too - res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize = normalize_log(scale = 1e3, threads = 1L))) - res_norm_implicit_partial <- select_features_variance(normalize = normalize_log(scale_factor = 1e3), num_feats = 10)(m1) + res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize_method = normalize_log(scale = 1e3, threads = 1L))) + res_norm_implicit_partial <- select_features_variance(normalize_method = normalize_log(scale_factor = 1e3), num_feats = 10)(m1) expect_identical(res_norm_partial, res_norm_implicit_partial) expect_true(!all((res_no_norm %>% dplyr::arrange(feature))$score == (res_norm_partial %>% 
dplyr::arrange(feature))$score)) } From f1104bfaaf186456779ee789d35e85d1a99c0d27 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 19 Feb 2025 16:42:04 -0800 Subject: [PATCH 094/142] [r] fix removal of pcs in `IterativeLSI()`, documentation --- r/R/singlecell_utils.R | 36 ++++++++++++++++++------------------ r/man/IterativeLSI.Rd | 5 +++-- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index f1dd04dc..967c528b 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -393,20 +393,21 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @return #' **IterativeLSI()** An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data as a matrix of shape `(n_dimensions, ncol(mat))` -#' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: +#' - `fitted_params`: A list of the parameters used for iterative LSI. Includes the following: #' - `lsi_method`: The method used for LSI #' - `cluster_method`: The method used for clustering #' - `feature_means`: The means of the features used for tf-idf normalization #' - `iterations`: The number of LSI iterations ran -#' - `iter_info`: A tibble with the following columns: +#' - `iter_info`: A tibble of iteration info with rows as iterations. Columns include the following: #' - `iteration`: The iteration number #' - `feature_names`: The names of the features used for the iteration #' - `feature_loadings`: SVD component `u` with dimension `(n_dimensions, n_variable_features)` +#' - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth #' - `clusters`: The clusters for the iteration. 
This is blank for the first iteration #' @details #' The Iterative LSI method is as follows: #' - First iteration: -#' - Select features based on the `feature_selection_method` argument +#' - Select features using `feature_selection_method` #' - Perform LSI on the selected features #' - If `n_iterations` is 1, return the projected data from the first PCA projection #' - Else, cluster the LSI results using `cluster_method` @@ -450,12 +451,14 @@ IterativeLSI <- function( iter_info = tibble::tibble( iteration = integer(), feature_names = list(), - lsi_results = list(), + feature_loadings = list(), + pcs_to_keep = list(), clusters = list() ) ) if (verbose) log_progress("Starting Iterative LSI") for (i in seq_len(n_iterations)) { + if (verbose) log_progress(sprintf("Starting Iterative LSI iteration %s of %s", i, n_iterations)) # add a blank row to the iter_info tibble fitted_params$iter_info <- tibble::add_row(fitted_params$iter_info, iteration = i) @@ -480,21 +483,18 @@ IterativeLSI <- function( verbose = verbose, .missing_args_error = FALSE )(mat[mat_indices,]) - fitted_params$iter_info$lsi_results[[i]] <- lsi_res_obj$fitted_params - # remove the feature means from the lsi results as they are already calculated - # save minimum info for lsi results if not onn terminal iteration - fitted_params$iter_info$lsi_results[[i]]$feature_means <- NULL + fitted_params$iter_info$feature_loadings[[i]] <- lsi_res_obj$fitted_params$feature_loadings + fitted_params$iter_info$pcs_to_keep[[i]] <- lsi_res_obj$fitted_params$pcs_to_keep # only cluster + pseudobulk if this isn't the last iteration - if (i == n_iterations) break + if (i == n_iterations) break # cluster the LSI results if (verbose) log_progress("Clustering LSI results") - clustering_res <- t(lsi_res_obj$cell_embeddings) %>% partial_apply(cluster_method, threads = threads, .missing_args_error = FALSE)() + clustering_res <- t(lsi_res_obj$cell_embeddings[fitted_params$iter_info$pcs_to_keep[[i]], ]) %>% + 
partial_apply(cluster_method, threads = threads, .missing_args_error = FALSE)() fitted_params$iter_info$clusters[[i]] <- clustering_res # pseudobulk and pass onto next iteration if (verbose) log_progress("Pseudobulking matrix") pseudobulk_res <- pseudobulk_matrix(mat, clustering_res, threads = as.integer(threads)) - # Only take the SVD information required to project the matrix - fitted_params$iter_info$lsi_results[[i]]$feature_loadings <- lsi_res_obj$fitted_params$feature_loadings rownames(pseudobulk_res) <- rownames(mat) } if (verbose) log_progress("Finished running Iterative LSI") @@ -515,7 +515,7 @@ project.IterativeLSI <- function(x, mat, threads = 1L, ...) { assert_is_mat(mat) fitted_params <- x$fitted_params # Get the final row of fitted params - last_iter_info <- fitted_params$iter_info[nrow(fitted_params$iter_info),] + last_iter_info <- fitted_params$iter_info[nrow(fitted_params$iter_info), ] # Do a check to make sure that the fitted features all exist in input matrix if (!is.null(rownames(mat)) && !is.null(x$feature_names)) { @@ -531,7 +531,7 @@ project.IterativeLSI <- function(x, mat, threads = 1L, ...) { # Run LSI # since we don't hold the LSI object, copy the internal logic from `project.LSI()` lsi_attr <- attr(x$fitted_params$lsi_method, "args") - + mat <- normalize_tfidf( mat = mat, feature_means = fitted_params$feature_means, @@ -542,11 +542,11 @@ project.IterativeLSI <- function(x, mat, threads = 1L, ...) 
{ convert_matrix_type(mat, type = "float"), tempfile("mat"), compress = TRUE ) - - feature_loadings <- last_iter_info$lsi_results[[1]]$feature_loadings + + feature_loadings <- last_iter_info$feature_loadings[[1]] res <- t(feature_loadings) %*% mat - if (length(last_iter_info$lsi_results[[1]]$pcs_to_keep) != nrow(res)) { - res <- res[last_iter_info$lsi_results[[1]]$pcs_to_keep,] + if (length(last_iter_info$pcs_to_keep[[1]]) != nrow(res)) { + res <- res[last_iter_info$pcs_to_keep[[1]]$pcs_to_keep,] } return(res) } diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 939d7e83..d80355e2 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -41,17 +41,18 @@ The user can pass in partial parameters to the cluster method, such as by passin \strong{IterativeLSI()} An object of class \code{c("IterativeLSI", "DimReduction")} with the following attributes: \itemize{ \item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(n_dimensions, ncol(mat))} -\item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: +\item \code{fitted_params}: A list of the parameters used for iterative LSI, with rows as iterations. Columns include the following: \itemize{ \item \code{lsi_method}: The method used for LSI \item \code{cluster_method}: The method used for clustering \item \code{feature_means}: The means of the features used for tf-idf normalization \item \code{iterations}: The number of LSI iterations ran -\item \code{iter_info}: A tibble with the following columns: +\item \code{iter_info}: A tibble of iteration info with rows as iterations. 
Columns include the following: \itemize{ \item \code{iteration}: The iteration number \item \code{feature_names}: The names of the features used for the iteration \item \code{feature_loadings}: SVD component \code{u} with dimension \verb{(n_dimensions, n_variable_features)} +\item \code{pcs_to_keep}: The PCs that were kept after filtering by correlation to sequencing depth \item \code{clusters}: The clusters for the iteration. This is blank for the first iteration } } From b98518af05a09be83d68cad80294fd5354cbb5bc Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 19 Feb 2025 17:03:02 -0800 Subject: [PATCH 095/142] [r] clean up `IterativeLSI()` --- r/R/singlecell_utils.R | 47 ++++++++++++++---------- r/man/IterativeLSI.Rd | 18 ++++++--- r/tests/testthat/test-singlecell_utils.R | 4 +- 3 files changed, 41 insertions(+), 28 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 967c528b..5cf9afd6 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -387,9 +387,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @param cluster_method (function) Method to use for clustering a kNN matrix. #' Current builtin options are `cluster_graph_{leiden, louvain, seurat}()`. #' The user can pass in partial parameters to the cluster method, such as by passing -#' `cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))` -#' @param lsi_method (function) Method to use for LSI. Only `LSI` is allowed. The user can pass in partial parameters to `LSI` to customize the LSI method, -#' such as by passing `LSI(n_dimensions = 30, corr_cutoff = 0.5)`. +#' `cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))`. 
#' @return #' **IterativeLSI()** An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data as a matrix of shape `(n_dimensions, ncol(mat))` @@ -433,7 +431,9 @@ IterativeLSI <- function( mat, n_iterations = 2, feature_selection_method = select_features_variance, - lsi_method = LSI, + scale_factor = 1e4, + n_dimensions = 50L, + corr_cutoff = 1, cluster_method = cluster_graph_leiden, threads = 1L, verbose = FALSE ) { @@ -444,7 +444,7 @@ IterativeLSI <- function( assert_is_wholenumber(threads) assert_is(verbose, "logical") fitted_params <- list( - lsi_method = lsi_method, + lsi_method = LSI(n_dimensions = n_dimensions, corr_cutoff = corr_cutoff, scale_factor = scale_factor, threads = threads), cluster_method = cluster_method, feature_means = matrix_stats(mat, row_stats = "mean", threads = threads)$row_stats["mean",], iterations = n_iterations, @@ -461,35 +461,42 @@ IterativeLSI <- function( if (verbose) log_progress(sprintf("Starting Iterative LSI iteration %s of %s", i, n_iterations)) # add a blank row to the iter_info tibble - fitted_params$iter_info <- tibble::add_row(fitted_params$iter_info, iteration = i) + # run variable feature selection if (verbose) log_progress("Selecting features") if (i == 1) { - variable_features <- partial_apply(feature_selection_method, threads = threads, .missing_args_error = FALSE)(mat) + var_feature_table <- partial_apply(feature_selection_method, threads = threads, .missing_args_error = FALSE)(mat) } else { - variable_features <- partial_apply(feature_selection_method, threads = threads, .missing_args_error = FALSE)(pseudobulk_res) + var_feature_table <- partial_apply(feature_selection_method, threads = threads, .missing_args_error = FALSE)(pseudobulk_res) } - fitted_params$iter_info$feature_names[[i]] <- variable_features %>% dplyr::filter(highly_variable) %>% dplyr::pull(feature) - if (is.character(fitted_params$iter_info$feature_names[[i]])) { - mat_indices <- 
which(rownames(mat) %in% fitted_params$iter_info$feature_names[[i]]) + variable_features <- var_feature_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(feature) + if (is.character(variable_features)) { + mat_indices <- which(rownames(mat) %in% variable_features) } else { - mat_indices <- fitted_params$iter_info$feature_names[[i]] + mat_indices <- variable_features } # run LSI if (verbose) log_progress("Running LSI") - lsi_res_obj <- partial_apply( - lsi_method, + + lsi_res_obj <- LSI( + mat = mat[mat_indices,], + n_dimensions = n_dimensions, + scale_factor = scale_factor, threads = threads, - verbose = verbose, - .missing_args_error = FALSE - )(mat[mat_indices,]) - fitted_params$iter_info$feature_loadings[[i]] <- lsi_res_obj$fitted_params$feature_loadings - fitted_params$iter_info$pcs_to_keep[[i]] <- lsi_res_obj$fitted_params$pcs_to_keep + verbose = verbose + ) # only cluster + pseudobulk if this isn't the last iteration + fitted_params$iter_info <- tibble::add_row( + fitted_params$iter_info, iteration = i, + feature_names = list(variable_features), + feature_loadings = list(lsi_res_obj$fitted_params$feature_loadings), + pcs_to_keep = list(lsi_res_obj$fitted_params$pcs_to_keep) + ) if (i == n_iterations) break + # cluster the LSI results if (verbose) log_progress("Clustering LSI results") - clustering_res <- t(lsi_res_obj$cell_embeddings[fitted_params$iter_info$pcs_to_keep[[i]], ]) %>% + clustering_res <- t(lsi_res_obj$cell_embeddings[lsi_res_obj$fitted_params$pcs_to_keep, ]) %>% partial_apply(cluster_method, threads = threads, .missing_args_error = FALSE)() fitted_params$iter_info$clusters[[i]] <- clustering_res # pseudobulk and pass onto next iteration diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index d80355e2..a4a02e6e 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -9,7 +9,9 @@ IterativeLSI( mat, n_iterations = 2, feature_selection_method = select_features_variance, - lsi_method = LSI, + scale_factor = 10000, + 
n_dimensions = 50L, + corr_cutoff = 1, cluster_method = cluster_graph_leiden, threads = 1L, verbose = FALSE @@ -25,13 +27,17 @@ IterativeLSI( \item{feature_selection_method}{(function) Method to use for selecting features for each iteration after the first. Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_mean}, \code{select_features_binned_dispersion}} -\item{lsi_method}{(function) Method to use for LSI. Only \code{LSI} is allowed. The user can pass in partial parameters to \code{LSI} to customize the LSI method, -such as by passing \code{LSI(n_dimensions = 30, corr_cutoff = 0.5)}.} +\item{scale_factor}{(numeric) Scaling factor to multiply matrix by prior to log normalization (see formulas below).} + +\item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} + +\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +the corr_cutoff, it will be excluded from the final PCA matrix.} \item{cluster_method}{(function) Method to use for clustering a kNN matrix. Current builtin options are \verb{cluster_graph_\{leiden, louvain, seurat\}()}. 
The user can pass in partial parameters to the cluster method, such as by passing -\code{cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))}} +\code{cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))}.} \item{threads}{(integer) Number of threads to use.} @@ -41,7 +47,7 @@ The user can pass in partial parameters to the cluster method, such as by passin \strong{IterativeLSI()} An object of class \code{c("IterativeLSI", "DimReduction")} with the following attributes: \itemize{ \item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(n_dimensions, ncol(mat))} -\item \code{fitted_params}: A list of the parameters used for iterative LSI, with rows as iterations. Columns include the following: +\item \code{fitted_params}: A list of the parameters used for iterative LSI. Includes the following: \itemize{ \item \code{lsi_method}: The method used for LSI \item \code{cluster_method}: The method used for clustering @@ -70,7 +76,7 @@ The Iterative LSI method is as follows: \itemize{ \item First iteration: \itemize{ -\item Select features based on the \code{feature_selection_method} argument +\item Select features using \code{feature_selection_method} \item Perform LSI on the selected features \item If \code{n_iterations} is 1, return the projected data from the first PCA projection \item Else, cluster the LSI results using \code{cluster_method} diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 61cb7f54..970c6632 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -238,7 +238,7 @@ test_that("Iterative LSI works", { mat <- matrix(data = runif(50000, 0, 1), nrow=500, ncol = 100) %>% as("dgCMatrix") %>% as("IterableMatrix") rownames(mat) <- paste0("feat", seq_len(nrow(mat))) colnames(mat) <- 
paste0("cell", seq_len(ncol(mat))) - lsi_res_obj <- expect_no_error(IterativeLSI(mat, n_iterations = 2, lsi_method = LSI(n_dimensions = 10), cluster_method = cluster_graph_louvain(knn_mat_method = knn_hnsw))) + lsi_res_obj <- expect_no_error(IterativeLSI(mat, n_iterations = 2, n_dimensions = 10L, cluster_method = cluster_graph_louvain(knn_mat_method = knn_hnsw))) lsi_res_proj <- project(lsi_res_obj, mat) lsi_res_embedding <- lsi_res_obj$cell_embeddings expect_equal(ncol(lsi_res_embedding), ncol(mat)) @@ -253,7 +253,7 @@ test_that("Iterative LSI works with parameterized clustering", { colnames(mat) <- paste0("cell", seq_len(ncol(mat))) lsi_res_obj <- expect_no_error( IterativeLSI( - mat, lsi_method = LSI(n_dimensions = 10), + mat, n_dimensions = 10L,, cluster_method = cluster_graph_leiden( knn_mat_method = knn_annoy(k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1) From acc09649097289985a1017ec1e62434801bd0dd1 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 19 Feb 2025 17:26:45 -0800 Subject: [PATCH 096/142] [r] globally rename knn matrix to knn object --- r/R/clustering.R | 30 +++++++++++++------------- r/R/singlecell_utils.R | 2 +- r/man/IterativeLSI.Rd | 2 +- r/man/cluster.Rd | 6 +++--- r/man/convert_mat_to_cluster_matrix.Rd | 12 +++++------ r/man/is_knn_matrix.Rd | 4 ++-- r/man/knn.Rd | 2 +- r/man/knn_graph.Rd | 4 ++-- 8 files changed, 31 insertions(+), 31 deletions(-) diff --git a/r/R/clustering.R b/r/R/clustering.R index a4fb9f5e..e2a9afca 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -9,10 +9,10 @@ #' Check if an input is a kNN output matrix #' -#' knn matrix functions `knn_hnsw()` and `knn_annoy()` return a list of two matrices, `idx` and `dist`. +#' knn object functions `knn_hnsw()` and `knn_annoy()` return a list of two matrices, `idx` and `dist`. #' These are used as inputs to create graph adjacency matrices for clustering. #' Assume any list with both `idx` and `dist` is a kNN object. 
-#' @return TRUE if the mat is a kNN matrix, FALSE otherwise +#' @return TRUE if the mat is a knn object, FALSE otherwise #' @keywords internal is_knn_matrix <- function(mat) { return(is(mat, "list") && all(c("idx", "dist") %in% names(mat))) @@ -36,18 +36,18 @@ is_adjacency_matrix <- function(mat) { #' #' @param mat Input matrix to be converted. #' @param required_mat_type (character) Required matrix type: "adjacency" or "knn". -#' @param knn_mat_method (function) Function to convert to a knn matrix (e.g., `knn_hnsw`, `knn_annoy`). +#' @param knn_mat_method (function) Function to convert to a knn object (e.g., `knn_hnsw`, `knn_annoy`). #' Ignored if `mat` is already a knn or graph matrix. -#' @param knn_graph_method (function) Function to convert a knn matrix to a graph matrix +#' @param knn_graph_method (function) Function to convert a knn object to a graph matrix #' (e.g., `knn_to_geodesic_graph`). Ignored if `required_mat_type` is "knn" or if #' `mat` is already a graph matrix. #' @details #' This function checks the type of the input matrix `mat`. `mat` is returned without modification if #' it is already the required type (adjacency or knn). -#' If `mat` is a standard matrix and `required_mat_type` is "knn", then `knn_mat_method` is used to convert `mat` to a knn matrix. -#' If `required_mat_type` is "adjacency", then `knn_mat_method` is used to first convert `mat` to a knn matrix, -#' then `knn_graph_method` is used to convert the knn matrix to an adjacency matrix. -#' @return The converted matrix. +#' If `mat` is a standard matrix and `required_mat_type` is "knn", then `knn_mat_method` is used to convert `mat` to a knn object. +#' If `required_mat_type` is "adjacency", then `knn_mat_method` is used to first convert `mat` to a knn object, +#' then `knn_graph_method` is used to convert the knn object to an adjacency matrix. +#' @return The converted matrix object. 
#' @keywords internal convert_mat_to_cluster_matrix <- function( mat, @@ -60,7 +60,7 @@ convert_mat_to_cluster_matrix <- function( mat <- knn_mat_method(mat) } if (required_mat_type == "knn" && !is_knn_matrix(mat)) { - pretty_error(mat, "must be a knn matrix, or convertible to one", 1) + pretty_error(mat, "must be a knn object, or convertible to one", 1) } if (required_mat_type == "adjacency" && is_knn_matrix(mat)) { mat <- knn_graph_method(mat) @@ -84,8 +84,8 @@ convert_mat_to_cluster_matrix <- function( #' dist for cell x K neighbor distances #' @param use_weights boolean for whether to replace all distance weights with 1 #' @param self_loops boolean for whether to allow cells to count themselves as neighbors -#' @param knn_mat_method (function) if knn is not a knn matrix, this function will attempt to convert it to one. -#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn matrix. +#' @param knn_mat_method (function) if knn is not a knn object, this function will attempt to convert it to one. +#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn object. #' @return **knn_to_graph** #' Sparse matrix (dgCMatrix) where `mat[i,j]` = distance from cell `i` to #' cell `j`, or 0 if cell `j` is not in the K nearest neighbors of `i` @@ -215,9 +215,9 @@ knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads= #' @param resolution Resolution parameter. Higher values result in more clusters #' @param objective_function Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). #' For the meaning of each option, see `igraph::cluster_leiden()`. -#' @param knn_mat_method (function) if snn represents a regular non-knn matrix, this function will attempt to convert it a knn matrix. 
-#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn matrix. -#' @param knn_graph_method (function) if snn represents a knn matrix, this function will attempt to convert it to a graph matrix. +#' @param knn_mat_method (function) if snn represents a regular non-knn object, this function will attempt to convert it a knn object. +#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn object. +#' @param knn_graph_method (function) if snn represents a knn object, this function will attempt to convert it to a graph matrix. #' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. #' Ignored if snn is already a graph matrix. #' @param seed Random seed for clustering initialization @@ -321,7 +321,7 @@ cluster_membership_matrix <- function(groups, group_order = NULL) { } -#' Get a knn matrix from reduced dimensions +#' Get a knn object from reduced dimensions #' #' Search for approximate nearest neighbors between cells in the reduced #' dimensions (e.g. PCA), and return the k nearest neighbors (knn) for each diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 5cf9afd6..6dd5fec3 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -384,7 +384,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @param n_iterations (int) The number of LSI iterations to perform. #' @param feature_selection_method (function) Method to use for selecting features for each iteration after the first. #' Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` -#' @param cluster_method (function) Method to use for clustering a kNN matrix. +#' @param cluster_method (function) Method to use for clustering the post-SVD matrix. #' Current builtin options are `cluster_graph_{leiden, louvain, seurat}()`. 
#' The user can pass in partial parameters to the cluster method, such as by passing #' `cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))`. diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index a4a02e6e..43bea792 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -34,7 +34,7 @@ Current builtin options are \code{select_features_variance}, \code{select_featur \item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to the corr_cutoff, it will be excluded from the final PCA matrix.} -\item{cluster_method}{(function) Method to use for clustering a kNN matrix. +\item{cluster_method}{(function) Method to use for clustering the post-SVD matrix. Current builtin options are \verb{cluster_graph_\{leiden, louvain, seurat\}()}. The user can pass in partial parameters to the cluster method, such as by passing \code{cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))}.} diff --git a/r/man/cluster.Rd b/r/man/cluster.Rd index ea677ae5..827d026e 100644 --- a/r/man/cluster.Rd +++ b/r/man/cluster.Rd @@ -40,10 +40,10 @@ cluster_graph_seurat( \item{objective_function}{Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). For the meaning of each option, see \code{igraph::cluster_leiden()}.} -\item{knn_mat_method}{(function) if snn represents a regular non-knn matrix, this function will attempt to convert it a knn matrix. -Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn matrix.} +\item{knn_mat_method}{(function) if snn represents a regular non-knn object, this function will attempt to convert it a knn object. 
+Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn object.} -\item{knn_graph_method}{(function) if snn represents a knn matrix, this function will attempt to convert it to a graph matrix. +\item{knn_graph_method}{(function) if snn represents a knn object, this function will attempt to convert it to a graph matrix. Must be a (optionally partialized) version of \code{knn_to_graph()}, \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Ignored if snn is already a graph matrix.} diff --git a/r/man/convert_mat_to_cluster_matrix.Rd b/r/man/convert_mat_to_cluster_matrix.Rd index 1842dc9e..bff29c4b 100644 --- a/r/man/convert_mat_to_cluster_matrix.Rd +++ b/r/man/convert_mat_to_cluster_matrix.Rd @@ -16,15 +16,15 @@ convert_mat_to_cluster_matrix( \item{required_mat_type}{(character) Required matrix type: "adjacency" or "knn".} -\item{knn_mat_method}{(function) Function to convert to a knn matrix (e.g., \code{knn_hnsw}, \code{knn_annoy}). +\item{knn_mat_method}{(function) Function to convert to a knn object (e.g., \code{knn_hnsw}, \code{knn_annoy}). Ignored if \code{mat} is already a knn or graph matrix.} -\item{knn_graph_method}{(function) Function to convert a knn matrix to a graph matrix +\item{knn_graph_method}{(function) Function to convert a knn object to a graph matrix (e.g., \code{knn_to_geodesic_graph}). Ignored if \code{required_mat_type} is "knn" or if \code{mat} is already a graph matrix.} } \value{ -The converted matrix. +The converted matrix object. } \description{ Ensures that the input matrix is converted to the correct type (knn or adjacency) @@ -34,8 +34,8 @@ it is returned as is. \details{ This function checks the type of the input matrix \code{mat}. \code{mat} is returned without modification if it is already the required type (adjacency or knn). 
-If \code{mat} is a standard matrix and \code{required_mat_type} is "knn", then \code{knn_mat_method} is used to convert \code{mat} to a knn matrix. -If \code{required_mat_type} is "adjacency", then \code{knn_mat_method} is used to first convert \code{mat} to a knn matrix, -then \code{knn_graph_method} is used to convert the knn matrix to an adjacency matrix. +If \code{mat} is a standard matrix and \code{required_mat_type} is "knn", then \code{knn_mat_method} is used to convert \code{mat} to a knn object. +If \code{required_mat_type} is "adjacency", then \code{knn_mat_method} is used to first convert \code{mat} to a knn object, +then \code{knn_graph_method} is used to convert the knn object to an adjacency matrix. } \keyword{internal} diff --git a/r/man/is_knn_matrix.Rd b/r/man/is_knn_matrix.Rd index fc84877a..85bbf5f2 100644 --- a/r/man/is_knn_matrix.Rd +++ b/r/man/is_knn_matrix.Rd @@ -7,10 +7,10 @@ is_knn_matrix(mat) } \value{ -TRUE if the mat is a kNN matrix, FALSE otherwise +TRUE if the mat is a knn object, FALSE otherwise } \description{ -knn matrix functions \code{knn_hnsw()} and \code{knn_annoy()} return a list of two matrices, \code{idx} and \code{dist}. +knn object functions \code{knn_hnsw()} and \code{knn_annoy()} return a list of two matrices, \code{idx} and \code{dist}. These are used as inputs to create graph adjacency matrices for clustering. Assume any list with both \code{idx} and \code{dist} is a kNN object. 
} diff --git a/r/man/knn.Rd b/r/man/knn.Rd index 5952e69b..b92cc250 100644 --- a/r/man/knn.Rd +++ b/r/man/knn.Rd @@ -3,7 +3,7 @@ \name{knn_hnsw} \alias{knn_hnsw} \alias{knn_annoy} -\title{Get a knn matrix from reduced dimensions} +\title{Get a knn object from reduced dimensions} \usage{ knn_hnsw( data, diff --git a/r/man/knn_graph.Rd b/r/man/knn_graph.Rd index 5f165b82..574fc8e0 100644 --- a/r/man/knn_graph.Rd +++ b/r/man/knn_graph.Rd @@ -36,8 +36,8 @@ dist for cell x K neighbor distances} \item{self_loops}{Whether to allow self-loops in the output graph} -\item{knn_mat_method}{(function) if knn is not a knn matrix, this function will attempt to convert it to one. -Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn matrix.} +\item{knn_mat_method}{(function) if knn is not a knn object, this function will attempt to convert it to one. +Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn object.} \item{min_val}{minimum jaccard index between neighbors. Values below this will round to 0} From 24d7e6953f32bc53b64e20c6dc1fc2f372083c82 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 19 Feb 2025 17:51:07 -0800 Subject: [PATCH 097/142] [r] update clustering functions to use `mat` instead of `snn` --- r/R/clustering.R | 70 +++++++++++++------------- r/man/cluster.Rd | 16 +++--- r/man/convert_mat_to_cluster_matrix.Rd | 21 ++++---- r/man/is_knn_matrix.Rd | 6 +-- r/man/knn.Rd | 2 +- r/man/knn_graph.Rd | 4 +- 6 files changed, 59 insertions(+), 60 deletions(-) diff --git a/r/R/clustering.R b/r/R/clustering.R index e2a9afca..43836f15 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -9,10 +9,10 @@ #' Check if an input is a kNN output matrix #' -#' knn object functions `knn_hnsw()` and `knn_annoy()` return a list of two matrices, `idx` and `dist`. 
+#' knn matrix functions `knn_hnsw()` and `knn_annoy()` return a list of two matrices, `idx` and `dist`. #' These are used as inputs to create graph adjacency matrices for clustering. -#' Assume any list with both `idx` and `dist` is a kNN object. -#' @return TRUE if the mat is a knn object, FALSE otherwise +#' Assume any list with at least both `idx` and `dist` items is a kNN object. +#' @return TRUE if the mat is a kNN matrix, FALSE otherwise #' @keywords internal is_knn_matrix <- function(mat) { return(is(mat, "list") && all(c("idx", "dist") %in% names(mat))) @@ -36,18 +36,18 @@ is_adjacency_matrix <- function(mat) { #' #' @param mat Input matrix to be converted. #' @param required_mat_type (character) Required matrix type: "adjacency" or "knn". -#' @param knn_mat_method (function) Function to convert to a knn object (e.g., `knn_hnsw`, `knn_annoy`). +#' @param knn_mat_method (function) Function to convert to a knn matrix (e.g., `knn_hnsw`, `knn_annoy`). #' Ignored if `mat` is already a knn or graph matrix. -#' @param knn_graph_method (function) Function to convert a knn object to a graph matrix +#' @param knn_graph_method (function) Function to convert a knn matrix to a graph matrix #' (e.g., `knn_to_geodesic_graph`). Ignored if `required_mat_type` is "knn" or if #' `mat` is already a graph matrix. -#' @details +#' #' @details #' This function checks the type of the input matrix `mat`. `mat` is returned without modification if #' it is already the required type (adjacency or knn). -#' If `mat` is a standard matrix and `required_mat_type` is "knn", then `knn_mat_method` is used to convert `mat` to a knn object. -#' If `required_mat_type` is "adjacency", then `knn_mat_method` is used to first convert `mat` to a knn object, -#' then `knn_graph_method` is used to convert the knn object to an adjacency matrix. -#' @return The converted matrix object. 
+#' If `mat` is a standard matrix and `required_mat_type` is "knn", then `knn_mat_method` is used to convert `mat` to a knn matrix. +#' If `required_mat_type` is "adjacency", then `knn_mat_method` is used to first convert `mat` to a knn matrix, +#' then `knn_graph_method` is used to convert the knn matrix to an adjacency matrix. +#' @return The converted matrix. #' @keywords internal convert_mat_to_cluster_matrix <- function( mat, @@ -60,7 +60,7 @@ convert_mat_to_cluster_matrix <- function( mat <- knn_mat_method(mat) } if (required_mat_type == "knn" && !is_knn_matrix(mat)) { - pretty_error(mat, "must be a knn object, or convertible to one", 1) + pretty_error(mat, "must be a knn matrix, or convertible to one", 1) } if (required_mat_type == "adjacency" && is_knn_matrix(mat)) { mat <- knn_graph_method(mat) @@ -84,8 +84,8 @@ convert_mat_to_cluster_matrix <- function( #' dist for cell x K neighbor distances #' @param use_weights boolean for whether to replace all distance weights with 1 #' @param self_loops boolean for whether to allow cells to count themselves as neighbors -#' @param knn_mat_method (function) if knn is not a knn object, this function will attempt to convert it to one. -#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn object. +#' @param knn_mat_method (function) if knn is not a knn matrix, this function will attempt to convert it to one. +#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn matrix. 
#' @return **knn_to_graph** #' Sparse matrix (dgCMatrix) where `mat[i,j]` = distance from cell `i` to #' cell `j`, or 0 if cell `j` is not in the K nearest neighbors of `i` @@ -190,7 +190,7 @@ knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads= return_type <- match.arg(return_type) assert_is_wholenumber(threads) if (rlang::is_missing(knn)) return(create_partial()) - knn_mat_method <- partial_apply(knn_mat_method, threads = threads, .missing_args_error = FALSE) + knn_mat_method <- partial_apply(knn_mat_method, threads = threads) knn <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_mat_method = knn_mat_method) graph <- build_umap_graph_cpp(knn$dist, knn$idx, threads=threads) @@ -211,28 +211,28 @@ knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads= #' Note that when using `objective_function = "CPM"` the number of clusters empirically scales with `cells * resolution`, #' so 1e-3 is a good resolution for 10k cells, but 1M cells is better with a 1e-5 resolution. A resolution of 1 is a #' good default when `objective_function = "modularity"` per the default. -#' @param snn Symmetric adjacency matrix (dgCMatrix) output from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Only the lower triangle is used +#' @param mat Symmetric adjacency matrix (dgCMatrix) output from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Only the lower triangle is used #' @param resolution Resolution parameter. Higher values result in more clusters #' @param objective_function Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). #' For the meaning of each option, see `igraph::cluster_leiden()`. -#' @param knn_mat_method (function) if snn represents a regular non-knn object, this function will attempt to convert it a knn object. -#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. 
Ignored if knn is already a knn object. -#' @param knn_graph_method (function) if snn represents a knn object, this function will attempt to convert it to a graph matrix. +#' @param knn_mat_method (function) if mat represents a regular non-knn matrix, this function will attempt to convert it a knn matrix. +#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn matrix. +#' @param knn_graph_method (function) if mat represents a knn matrix, this function will attempt to convert it to a graph matrix. #' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. -#' Ignored if snn is already a graph matrix. +#' Ignored if mat is already a graph adjacency matrix. #' @param seed Random seed for clustering initialization #' @param ... Additional arguments to underlying clustering function #' @return Factor vector containing the cluster assignment for each cell. #' @export cluster_graph_leiden <- function( - snn, resolution = 1, objective_function = c("modularity", "CPM"), + mat, resolution = 1, objective_function = c("modularity", "CPM"), knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, seed = 12531, ... 
) { assert_has_package("igraph") # Set seed without permanently changing seed state - if (rlang::is_missing(snn)) return(create_partial()) - snn <- convert_mat_to_cluster_matrix( - snn, required_mat_type = "adjacency", knn_mat_method = knn_mat_method, + if (rlang::is_missing(mat)) return(create_partial()) + mat <- convert_mat_to_cluster_matrix( + mat, required_mat_type = "adjacency", knn_mat_method = knn_mat_method, knn_graph_method = knn_graph_method ) prev_seed <- get_seed() @@ -241,7 +241,7 @@ cluster_graph_leiden <- function( objective_function <- match.arg(objective_function) - igraph::graph_from_adjacency_matrix(snn, weighted = TRUE, diag = FALSE, mode = "lower") %>% + igraph::graph_from_adjacency_matrix(mat, weighted = TRUE, diag = FALSE, mode = "lower") %>% igraph::cluster_leiden(resolution_parameter = resolution, objective_function=objective_function, ...) %>% igraph::membership() %>% as.factor() @@ -252,14 +252,14 @@ cluster_graph_leiden <- function( #' @details **cluster_graph_louvain**: Louvain graph clustering algorithm `igraph::cluster_louvain()` #' @export cluster_graph_louvain <- function( - snn, resolution = 1, knn_mat_method = knn_hnsw, + mat, resolution = 1, knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, seed = 12531 ) { assert_has_package("igraph") # Set seed without permanently changing seed state - if (rlang::is_missing(snn)) return(create_partial()) - snn <- convert_mat_to_cluster_matrix( - snn, required_mat_type = "adjacency", knn_mat_method = knn_mat_method, + if (rlang::is_missing(mat)) return(create_partial()) + mat <- convert_mat_to_cluster_matrix( + mat, required_mat_type = "adjacency", knn_mat_method = knn_mat_method, knn_graph_method = knn_graph_method ) @@ -267,7 +267,7 @@ cluster_graph_louvain <- function( on.exit(restore_seed(prev_seed), add = TRUE) set.seed(seed) - igraph::graph_from_adjacency_matrix(snn, weighted = TRUE, diag = FALSE, mode = "lower") %>% + igraph::graph_from_adjacency_matrix(mat, weighted = 
TRUE, diag = FALSE, mode = "lower") %>% igraph::cluster_louvain(resolution = resolution) %>% igraph::membership() %>% as.factor() @@ -277,16 +277,16 @@ cluster_graph_louvain <- function( #' @details **cluster_graph_seurat**: Seurat's clustering algorithm `Seurat::FindClusters()` #' @export cluster_graph_seurat <- function( - snn, resolution = 0.8, knn_mat_method = knn_hnsw, + mat, resolution = 0.8, knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, ... ) { assert_has_package("Seurat") - if (rlang::is_missing(snn)) return(create_partial()) - snn <- convert_mat_to_cluster_matrix( - snn, required_mat_type = "adjacency", knn_mat_method = knn_mat_method, + if (rlang::is_missing(mat)) return(create_partial()) + mat <- convert_mat_to_cluster_matrix( + mat, required_mat_type = "adjacency", knn_mat_method = knn_mat_method, knn_graph_method = knn_graph_method ) - Seurat::as.Graph(snn) %>% + Seurat::as.Graph(mat) %>% Seurat::FindClusters(resolution = resolution, ...) %>% .[[1]] } @@ -321,7 +321,7 @@ cluster_membership_matrix <- function(groups, group_order = NULL) { } -#' Get a knn object from reduced dimensions +#' Get a knn matrix from reduced dimensions #' #' Search for approximate nearest neighbors between cells in the reduced #' dimensions (e.g. 
PCA), and return the k nearest neighbors (knn) for each diff --git a/r/man/cluster.Rd b/r/man/cluster.Rd index 827d026e..cb19146c 100644 --- a/r/man/cluster.Rd +++ b/r/man/cluster.Rd @@ -7,7 +7,7 @@ \title{Cluster an adjacency matrix} \usage{ cluster_graph_leiden( - snn, + mat, resolution = 1, objective_function = c("modularity", "CPM"), knn_mat_method = knn_hnsw, @@ -17,7 +17,7 @@ cluster_graph_leiden( ) cluster_graph_louvain( - snn, + mat, resolution = 1, knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, @@ -25,7 +25,7 @@ cluster_graph_louvain( ) cluster_graph_seurat( - snn, + mat, resolution = 0.8, knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, @@ -33,19 +33,19 @@ cluster_graph_seurat( ) } \arguments{ -\item{snn}{Symmetric adjacency matrix (dgCMatrix) output from e.g. \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Only the lower triangle is used} +\item{mat}{Symmetric adjacency matrix (dgCMatrix) output from e.g. \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Only the lower triangle is used} \item{resolution}{Resolution parameter. Higher values result in more clusters} \item{objective_function}{Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). For the meaning of each option, see \code{igraph::cluster_leiden()}.} -\item{knn_mat_method}{(function) if snn represents a regular non-knn object, this function will attempt to convert it a knn object. -Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn object.} +\item{knn_mat_method}{(function) if mat represents a regular non-knn matrix, this function will attempt to convert it a knn matrix. +Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. 
Ignored if knn is already a knn matrix.} -\item{knn_graph_method}{(function) if snn represents a knn object, this function will attempt to convert it to a graph matrix. +\item{knn_graph_method}{(function) if mat represents a knn matrix, this function will attempt to convert it to a graph matrix. Must be a (optionally partialized) version of \code{knn_to_graph()}, \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. -Ignored if snn is already a graph matrix.} +Ignored if mat is already a graph adjacency matrix.} \item{seed}{Random seed for clustering initialization} diff --git a/r/man/convert_mat_to_cluster_matrix.Rd b/r/man/convert_mat_to_cluster_matrix.Rd index bff29c4b..9e42e331 100644 --- a/r/man/convert_mat_to_cluster_matrix.Rd +++ b/r/man/convert_mat_to_cluster_matrix.Rd @@ -16,26 +16,25 @@ convert_mat_to_cluster_matrix( \item{required_mat_type}{(character) Required matrix type: "adjacency" or "knn".} -\item{knn_mat_method}{(function) Function to convert to a knn object (e.g., \code{knn_hnsw}, \code{knn_annoy}). +\item{knn_mat_method}{(function) Function to convert to a knn matrix (e.g., \code{knn_hnsw}, \code{knn_annoy}). Ignored if \code{mat} is already a knn or graph matrix.} -\item{knn_graph_method}{(function) Function to convert a knn object to a graph matrix +\item{knn_graph_method}{(function) Function to convert a knn matrix to a graph matrix (e.g., \code{knn_to_geodesic_graph}). Ignored if \code{required_mat_type} is "knn" or if -\code{mat} is already a graph matrix.} +\code{mat} is already a graph matrix. +#' @details +This function checks the type of the input matrix \code{mat}. \code{mat} is returned without modification if +it is already the required type (adjacency or knn). +If \code{mat} is a standard matrix and \code{required_mat_type} is "knn", then \code{knn_mat_method} is used to convert \code{mat} to a knn matrix. 
+If \code{required_mat_type} is "adjacency", then \code{knn_mat_method} is used to first convert \code{mat} to a knn matrix, +then \code{knn_graph_method} is used to convert the knn matrix to an adjacency matrix.} } \value{ -The converted matrix object. +The converted matrix. } \description{ Ensures that the input matrix is converted to the correct type (knn or adjacency) required by a clustering function. If the input is already of the correct type, it is returned as is. } -\details{ -This function checks the type of the input matrix \code{mat}. \code{mat} is returned without modification if -it is already the required type (adjacency or knn). -If \code{mat} is a standard matrix and \code{required_mat_type} is "knn", then \code{knn_mat_method} is used to convert \code{mat} to a knn object. -If \code{required_mat_type} is "adjacency", then \code{knn_mat_method} is used to first convert \code{mat} to a knn object, -then \code{knn_graph_method} is used to convert the knn object to an adjacency matrix. -} \keyword{internal} diff --git a/r/man/is_knn_matrix.Rd b/r/man/is_knn_matrix.Rd index 85bbf5f2..c3103d97 100644 --- a/r/man/is_knn_matrix.Rd +++ b/r/man/is_knn_matrix.Rd @@ -7,11 +7,11 @@ is_knn_matrix(mat) } \value{ -TRUE if the mat is a knn object, FALSE otherwise +TRUE if the mat is a kNN matrix, FALSE otherwise } \description{ -knn object functions \code{knn_hnsw()} and \code{knn_annoy()} return a list of two matrices, \code{idx} and \code{dist}. +knn matrix functions \code{knn_hnsw()} and \code{knn_annoy()} return a list of two matrices, \code{idx} and \code{dist}. These are used as inputs to create graph adjacency matrices for clustering. -Assume any list with both \code{idx} and \code{dist} is a kNN object. +Assume any list with at least both \code{idx} and \code{dist} items is a kNN object. 
} \keyword{internal} diff --git a/r/man/knn.Rd b/r/man/knn.Rd index b92cc250..5952e69b 100644 --- a/r/man/knn.Rd +++ b/r/man/knn.Rd @@ -3,7 +3,7 @@ \name{knn_hnsw} \alias{knn_hnsw} \alias{knn_annoy} -\title{Get a knn object from reduced dimensions} +\title{Get a knn matrix from reduced dimensions} \usage{ knn_hnsw( data, diff --git a/r/man/knn_graph.Rd b/r/man/knn_graph.Rd index 574fc8e0..5f165b82 100644 --- a/r/man/knn_graph.Rd +++ b/r/man/knn_graph.Rd @@ -36,8 +36,8 @@ dist for cell x K neighbor distances} \item{self_loops}{Whether to allow self-loops in the output graph} -\item{knn_mat_method}{(function) if knn is not a knn object, this function will attempt to convert it to one. -Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn object.} +\item{knn_mat_method}{(function) if knn is not a knn matrix, this function will attempt to convert it to one. +Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn matrix.} \item{min_val}{minimum jaccard index between neighbors. 
Values below this will round to 0} From b3ba6a3328513f1277e632b6087dab0ce1be2fe5 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 19 Feb 2025 17:51:16 -0800 Subject: [PATCH 098/142] update NEWS.md --- r/NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/NEWS.md b/r/NEWS.md index f12b7042..d966de80 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -14,11 +14,13 @@ Contributions welcome :) - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #189) - Add feature selection functions `select_features_variance()`, and `select_features_{dispersion,mean,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189) - Add `LSI()` and `IterativeLSI()` dimensionality functions to perform latent semantic indexing on a matrix (pull request #189). +- Add capability to create partial function objects when excluding the first argument of a function. This is implemented in normalizations, feature selections, dimensionality reductions, and clustering functions. See `select_features_variance()` for usage. (pull request #189) ## Improvements - Speed up taking large subsets of large concatenated matrices, e.g. selecting 9M cells from a 10M cell matrix composed of ~100 concatenated pieces. (pull request #179) - `matrix_stats()` now also works with types `matrix` and `dgCMatrix`. (pull request #190) - Fixed memory errors when running `writeInsertionBed()` and `writeInsertionBedGraph()` (pull request #{118, 134}) +- Change parameter name of `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()` from `snn` to `mat` to more accurately reflect the input type.
(pull request #189) ## Bug-fixes - Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175) From 0e6b17697ce7a47d6481302cc4a46b39f1444281 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 19 Feb 2025 18:09:43 -0800 Subject: [PATCH 099/142] [r] add threads and verbose arguments to cluster functions --- r/R/clustering.R | 42 +++++++++++++++----------- r/man/cluster.Rd | 12 +++++++- r/man/convert_mat_to_cluster_matrix.Rd | 4 ++- r/man/knn_graph.Rd | 21 ++++++++----- 4 files changed, 53 insertions(+), 26 deletions(-) diff --git a/r/R/clustering.R b/r/R/clustering.R index 43836f15..ff3fd3b4 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -53,17 +53,19 @@ convert_mat_to_cluster_matrix <- function( mat, required_mat_type = c("knn", "adjacency"), knn_mat_method = knn_hnsw, - knn_graph_method = knn_to_geodesic_graph + knn_graph_method = knn_to_geodesic_graph, + threads = 1L, + verbose = FALSE ) { required_mat_type <- match.arg(required_mat_type) if (is(mat, "matrix")) { - mat <- knn_mat_method(mat) + mat <- partial_apply(knn_mat_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) } if (required_mat_type == "knn" && !is_knn_matrix(mat)) { pretty_error(mat, "must be a knn matrix, or convertible to one", 1) } if (required_mat_type == "adjacency" && is_knn_matrix(mat)) { - mat <- knn_graph_method(mat) + mat <- partial_apply(knn_graph_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) } if (required_mat_type == "adjacency" && !is_adjacency_matrix(mat)) { pretty_error(mat, "must be a graph adjacency matrix, or convertible to one", 1) @@ -86,14 +88,18 @@ convert_mat_to_cluster_matrix <- function( #' @param self_loops boolean for whether to allow cells to count themselves as neighbors #' @param knn_mat_method (function) if knn is not a knn matrix, this function will attempt to convert it to one. #' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. 
Ignored if knn is already a knn matrix. +#' @param threads (integer) Number of threads to use. +#' @param verbose (logical) Whether to print progress information during search #' @return **knn_to_graph** #' Sparse matrix (dgCMatrix) where `mat[i,j]` = distance from cell `i` to #' cell `j`, or 0 if cell `j` is not in the K nearest neighbors of `i` -knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE, knn_mat_method = knn_hnsw) { +knn_to_graph <- function(knn, use_weights = FALSE, knn_mat_method = knn_hnsw, self_loops = TRUE, threads = 0L, verbose = FALSE) { assert_is(use_weights, "logical") assert_is(self_loops, "logical") + assert_is_wholenumber(threads) + assert_is(verbose, "logical") if (rlang::is_missing(knn)) return(create_partial()) - mat <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_mat_method = knn_mat_method) + mat <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_mat_method = knn_mat_method, threads = threads, verbose = verbose) if (use_weights) { weights <- knn$dist } else { @@ -132,11 +138,13 @@ knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE, knn_mat_me #' These correspond to the rows, cols, and values of non-zero entries in the lower triangle #' adjacency matrix. 
`dim` is the total number of vertices (cells) in the graph #' @export -knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_type=c("matrix", "list"), knn_mat_method = knn_hnsw) { +knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, knn_mat_method = knn_hnsw, return_type=c("matrix", "list"), threads = 0L, verbose = FALSE) { return_type <- match.arg(return_type) assert_is(self_loops, "logical") + assert_is_wholenumber(threads) + assert_is(verbose, "logical") if (rlang::is_missing(knn)) return(create_partial()) - knn <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_mat_method = knn_mat_method) + knn <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_mat_method = knn_mat_method, threads = threads, verbose = verbose) # Solve x / (2*K - x) >= min_val --> x >= 2*K*min_val / (1 + min_val) min_int <- ceiling(2*min_val*ncol(knn$idx) / (1 + min_val)) snn <- build_snn_graph_cpp(knn$idx, min_neighbors = min_int) @@ -176,7 +184,6 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t #' neighbor, results may differ slightly from `umap._umap.fuzzy_simplicial_set`, which #' assumes self is always successfully found in the approximate nearest neighbor search. #' -#' @param threads Number of threads to use during calculations #' @return **knn_to_geodesic_graph** #' - `return_type == "matrix"`: #' Sparse matrix (dgCMatrix) where `mat[i,j]` = normalized similarity between cell `i` and cell `j`. @@ -186,12 +193,11 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t #' These correspond to the rows, cols, and values of non-zero entries in the lower triangle #' adjacency matrix. 
`dim` is the total number of vertices (cells) in the graph #' @export -knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads=0L, knn_mat_method = knn_hnsw) { +knn_to_geodesic_graph <- function(knn, knn_mat_method = knn_hnsw, return_type = c("matrix", "list"), threads = 0L, verbose = FALSE) { return_type <- match.arg(return_type) assert_is_wholenumber(threads) if (rlang::is_missing(knn)) return(create_partial()) - knn_mat_method <- partial_apply(knn_mat_method, threads = threads) - knn <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_mat_method = knn_mat_method) + knn <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_mat_method = knn_mat_method, threads = threads, verbose = verbose) graph <- build_umap_graph_cpp(knn$dist, knn$idx, threads=threads) graph$dim <- nrow(knn$idx) @@ -221,19 +227,21 @@ knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads= #' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. #' Ignored if mat is already a graph adjacency matrix. #' @param seed Random seed for clustering initialization +#' @param threads (integer) Number of threads to use. +#' @param verbose (logical) Whether to print progress information during search #' @param ... Additional arguments to underlying clustering function #' @return Factor vector containing the cluster assignment for each cell. #' @export cluster_graph_leiden <- function( mat, resolution = 1, objective_function = c("modularity", "CPM"), - knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, seed = 12531, ... + knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, seed = 12531, threads = 0L, verbose = FALSE, ... 
) { assert_has_package("igraph") # Set seed without permanently changing seed state if (rlang::is_missing(mat)) return(create_partial()) mat <- convert_mat_to_cluster_matrix( mat, required_mat_type = "adjacency", knn_mat_method = knn_mat_method, - knn_graph_method = knn_graph_method + knn_graph_method = knn_graph_method, threads = threads, verbose = verbose ) prev_seed <- get_seed() on.exit(restore_seed(prev_seed), add = TRUE) @@ -253,14 +261,14 @@ cluster_graph_leiden <- function( #' @export cluster_graph_louvain <- function( mat, resolution = 1, knn_mat_method = knn_hnsw, - knn_graph_method = knn_to_geodesic_graph, seed = 12531 + knn_graph_method = knn_to_geodesic_graph, seed = 12531, threads = 0L, verbose = FALSE ) { assert_has_package("igraph") # Set seed without permanently changing seed state if (rlang::is_missing(mat)) return(create_partial()) mat <- convert_mat_to_cluster_matrix( mat, required_mat_type = "adjacency", knn_mat_method = knn_mat_method, - knn_graph_method = knn_graph_method + knn_graph_method = knn_graph_method, threads = threads, verbose = verbose ) prev_seed <- get_seed() @@ -278,13 +286,13 @@ cluster_graph_louvain <- function( #' @export cluster_graph_seurat <- function( mat, resolution = 0.8, knn_mat_method = knn_hnsw, - knn_graph_method = knn_to_geodesic_graph, ... + knn_graph_method = knn_to_geodesic_graph, threads = 0L, verbose = FALSE, ... ) { assert_has_package("Seurat") if (rlang::is_missing(mat)) return(create_partial()) mat <- convert_mat_to_cluster_matrix( mat, required_mat_type = "adjacency", knn_mat_method = knn_mat_method, - knn_graph_method = knn_graph_method + knn_graph_method = knn_graph_method, threads = threads, verbose = verbose ) Seurat::as.Graph(mat) %>% Seurat::FindClusters(resolution = resolution, ...) 
%>% diff --git a/r/man/cluster.Rd b/r/man/cluster.Rd index cb19146c..a4580925 100644 --- a/r/man/cluster.Rd +++ b/r/man/cluster.Rd @@ -13,6 +13,8 @@ cluster_graph_leiden( knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, seed = 12531, + threads = 0L, + verbose = FALSE, ... ) @@ -21,7 +23,9 @@ cluster_graph_louvain( resolution = 1, knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, - seed = 12531 + seed = 12531, + threads = 0L, + verbose = FALSE ) cluster_graph_seurat( @@ -29,6 +33,8 @@ cluster_graph_seurat( resolution = 0.8, knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, + threads = 0L, + verbose = FALSE, ... ) } @@ -49,6 +55,10 @@ Ignored if mat is already a graph adjacency matrix.} \item{seed}{Random seed for clustering initialization} +\item{threads}{(integer) Number of threads to use.} + +\item{verbose}{(logical) Whether to print progress information during search} + \item{...}{Additional arguments to underlying clustering function} } \value{ diff --git a/r/man/convert_mat_to_cluster_matrix.Rd b/r/man/convert_mat_to_cluster_matrix.Rd index 9e42e331..c7effc26 100644 --- a/r/man/convert_mat_to_cluster_matrix.Rd +++ b/r/man/convert_mat_to_cluster_matrix.Rd @@ -8,7 +8,9 @@ convert_mat_to_cluster_matrix( mat, required_mat_type = c("knn", "adjacency"), knn_mat_method = knn_hnsw, - knn_graph_method = knn_to_geodesic_graph + knn_graph_method = knn_to_geodesic_graph, + threads = 1L, + verbose = FALSE ) } \arguments{ diff --git a/r/man/knn_graph.Rd b/r/man/knn_graph.Rd index 5f165b82..044dfc10 100644 --- a/r/man/knn_graph.Rd +++ b/r/man/knn_graph.Rd @@ -9,23 +9,28 @@ knn_to_graph( knn, use_weights = FALSE, + knn_mat_method = knn_hnsw, self_loops = TRUE, - knn_mat_method = knn_hnsw + threads = 0L, + verbose = FALSE ) knn_to_snn_graph( knn, min_val = 1/15, self_loops = FALSE, + knn_mat_method = knn_hnsw, return_type = c("matrix", "list"), - knn_mat_method = knn_hnsw + threads = 0L, + verbose = FALSE ) 
knn_to_geodesic_graph( knn, + knn_mat_method = knn_hnsw, return_type = c("matrix", "list"), threads = 0L, - knn_mat_method = knn_hnsw + verbose = FALSE ) } \arguments{ @@ -34,17 +39,19 @@ dist for cell x K neighbor distances} \item{use_weights}{boolean for whether to replace all distance weights with 1} -\item{self_loops}{Whether to allow self-loops in the output graph} - \item{knn_mat_method}{(function) if knn is not a knn matrix, this function will attempt to convert it to one. Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn matrix.} +\item{self_loops}{Whether to allow self-loops in the output graph} + +\item{threads}{(integer) Number of threads to use.} + +\item{verbose}{(logical) Whether to print progress information during search} + \item{min_val}{minimum jaccard index between neighbors. Values below this will round to 0} \item{return_type}{Whether to return a sparse adjacency matrix or an edge list} - -\item{threads}{Number of threads to use during calculations} } \value{ \strong{knn_to_graph} From 7ce2ba2b10edf52e087d0a671bad61a0202228ab Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 19 Feb 2025 18:37:31 -0800 Subject: [PATCH 100/142] [r] add ability to project into different iterations of `IterativeLSI()` --- r/R/singlecell_utils.R | 21 ++++++++++++--------- r/man/IterativeLSI.Rd | 4 +++- r/tests/testthat/test-singlecell_utils.R | 3 +++ 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 6dd5fec3..d49edc36 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -514,25 +514,28 @@ IterativeLSI <- function( return(res) } #' @rdname IterativeLSI +#' @param iteration (integer) Which iteration of `IterativeLSI`'s features, loadings, and kept PCs to use for projection. #' @return #' **project()** IterableMatrix of the projected data of shape `(n_dimensions, ncol(mat))`. 
#' @inheritParams project #' @export -project.IterativeLSI <- function(x, mat, threads = 1L, ...) { +project.IterativeLSI <- function(x, mat, iteration = x$fitted_params$iterations, threads = 1L, ...) { assert_is_mat(mat) fitted_params <- x$fitted_params - # Get the final row of fitted params - last_iter_info <- fitted_params$iter_info[nrow(fitted_params$iter_info), ] + # Get the desired row of iter_info tibble + assert_is_wholenumber(iteration) + assert_true(iteration <= x$fitted_params$iterations) + iter_info <- fitted_params$iter_info[iteration, ] # Do a check to make sure that the fitted features all exist in input matrix if (!is.null(rownames(mat)) && !is.null(x$feature_names)) { assert_true(all(x$feature_names %in% rownames(mat))) } # Subset to variable features - if (is.character(last_iter_info$feature_names[[1]])) { - mat_indices <- which(rownames(mat) %in% last_iter_info$feature_names[[1]]) + if (is.character(iter_info$feature_names[[1]])) { + mat_indices <- which(rownames(mat) %in% iter_info$feature_names[[1]]) } else { - mat_indices <- last_iter_info$feature_names[[1]] + mat_indices <- iter_info$feature_names[[1]] } mat <- mat[mat_indices,] # Run LSI @@ -550,10 +553,10 @@ project.IterativeLSI <- function(x, mat, threads = 1L, ...) { tempfile("mat"), compress = TRUE ) - feature_loadings <- last_iter_info$feature_loadings[[1]] + feature_loadings <- iter_info$feature_loadings[[1]] res <- t(feature_loadings) %*% mat - if (length(last_iter_info$pcs_to_keep[[1]]) != nrow(res)) { - res <- res[last_iter_info$pcs_to_keep[[1]]$pcs_to_keep,] + if (length(iter_info$pcs_to_keep[[1]]) != nrow(res)) { + res <- res[iter_info$pcs_to_keep[[1]]$pcs_to_keep,] } return(res) } diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 43bea792..db98d412 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -17,7 +17,7 @@ IterativeLSI( verbose = FALSE ) -\method{project}{IterativeLSI}(x, mat, threads = 1L, ...) 
+\method{project}{IterativeLSI}(x, mat, iteration = x$fitted_params$iterations, threads = 1L, ...) } \arguments{ \item{mat}{(IterableMatrix) Counts matrix of shape \verb{(features x cells)}.} @@ -42,6 +42,8 @@ The user can pass in partial parameters to the cluster method, such as by passin \item{threads}{(integer) Number of threads to use.} \item{x}{DimReduction object.} + +\item{iteration}{(integer) Which iteration of \code{IterativeLSI}'s features, loadings, and kept PCs to use for projection.} } \value{ \strong{IterativeLSI()} An object of class \code{c("IterativeLSI", "DimReduction")} with the following attributes: diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 970c6632..6cd5b54f 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -240,9 +240,12 @@ test_that("Iterative LSI works", { colnames(mat) <- paste0("cell", seq_len(ncol(mat))) lsi_res_obj <- expect_no_error(IterativeLSI(mat, n_iterations = 2, n_dimensions = 10L, cluster_method = cluster_graph_louvain(knn_mat_method = knn_hnsw))) lsi_res_proj <- project(lsi_res_obj, mat) + lsi_res_proj_iter_1 <- expect_no_error(project(lsi_res_obj, mat, iteration = 1L)) lsi_res_embedding <- lsi_res_obj$cell_embeddings expect_equal(ncol(lsi_res_embedding), ncol(mat)) expect_equal(nrow(lsi_res_embedding), 10) + expect_equal(ncol(lsi_res_proj_iter_1), ncol(mat)) + expect_equal(nrow(lsi_res_proj_iter_1), 10) expect_equal(lsi_res_embedding, lsi_res_proj) }) From 280e769a9ebc604dbb842d34a11f5c330e460be3 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 19 Feb 2025 18:46:04 -0800 Subject: [PATCH 101/142] [r] re-change knn matrix to knn object --- r/R/clustering.R | 30 +++++++++++++------------- r/man/cluster.Rd | 6 +++--- r/man/convert_mat_to_cluster_matrix.Rd | 10 ++++----- r/man/is_knn_matrix.Rd | 6 +++--- r/man/knn.Rd | 2 +- r/man/knn_graph.Rd | 4 ++-- 6 files changed, 29 insertions(+), 29 deletions(-) diff --git 
a/r/R/clustering.R b/r/R/clustering.R index ff3fd3b4..f15c4192 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -7,12 +7,12 @@ # except according to those terms. -#' Check if an input is a kNN output matrix +#' Check if an input is a kNN object #' -#' knn matrix functions `knn_hnsw()` and `knn_annoy()` return a list of two matrices, `idx` and `dist`. +#' knn object functions `knn_hnsw()` and `knn_annoy()` return a list of two matrices, `idx` and `dist`. #' These are used as inputs to create graph adjacency matrices for clustering. #' Assume any list with at least both `idx` and `dist` items is a kNN object. -#' @return TRUE if the mat is a kNN matrix, FALSE otherwise +#' @return TRUE if the mat is a knn object, FALSE otherwise #' @keywords internal is_knn_matrix <- function(mat) { return(is(mat, "list") && all(c("idx", "dist") %in% names(mat))) @@ -36,17 +36,17 @@ is_adjacency_matrix <- function(mat) { #' #' @param mat Input matrix to be converted. #' @param required_mat_type (character) Required matrix type: "adjacency" or "knn". -#' @param knn_mat_method (function) Function to convert to a knn matrix (e.g., `knn_hnsw`, `knn_annoy`). +#' @param knn_mat_method (function) Function to convert to a knn object (e.g., `knn_hnsw`, `knn_annoy`). #' Ignored if `mat` is already a knn or graph matrix. -#' @param knn_graph_method (function) Function to convert a knn matrix to a graph matrix +#' @param knn_graph_method (function) Function to convert a knn object to a graph matrix #' (e.g., `knn_to_geodesic_graph`). Ignored if `required_mat_type` is "knn" or if #' `mat` is already a graph matrix. #' #' @details #' This function checks the type of the input matrix `mat`. `mat` is returned without modification if #' it is already the required type (adjacency or knn). -#' If `mat` is a standard matrix and `required_mat_type` is "knn", then `knn_mat_method` is used to convert `mat` to a knn matrix. 
-#' If `required_mat_type` is "adjacency", then `knn_mat_method` is used to first convert `mat` to a knn matrix, -#' then `knn_graph_method` is used to convert the knn matrix to an adjacency matrix. +#' If `mat` is a standard matrix and `required_mat_type` is "knn", then `knn_mat_method` is used to convert `mat` to a knn object. +#' If `required_mat_type` is "adjacency", then `knn_mat_method` is used to first convert `mat` to a knn object, +#' then `knn_graph_method` is used to convert the knn object to an adjacency matrix. #' @return The converted matrix. #' @keywords internal convert_mat_to_cluster_matrix <- function( @@ -62,7 +62,7 @@ convert_mat_to_cluster_matrix <- function( mat <- partial_apply(knn_mat_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) } if (required_mat_type == "knn" && !is_knn_matrix(mat)) { - pretty_error(mat, "must be a knn matrix, or convertible to one", 1) + pretty_error(mat, "must be a knn object, or convertible to one", 1) } if (required_mat_type == "adjacency" && is_knn_matrix(mat)) { mat <- partial_apply(knn_graph_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) @@ -86,8 +86,8 @@ convert_mat_to_cluster_matrix <- function( #' dist for cell x K neighbor distances #' @param use_weights boolean for whether to replace all distance weights with 1 #' @param self_loops boolean for whether to allow cells to count themselves as neighbors -#' @param knn_mat_method (function) if knn is not a knn matrix, this function will attempt to convert it to one. -#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn matrix. +#' @param knn_mat_method (function) if knn is not a knn object, this function will attempt to convert it to one. +#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn object. #' @param threads (integer) Number of threads to use. 
#' @param verbose (logical) Whether to print progress information during search #' @return **knn_to_graph** @@ -221,9 +221,9 @@ knn_to_geodesic_graph <- function(knn, knn_mat_method = knn_hnsw, return_type = #' @param resolution Resolution parameter. Higher values result in more clusters #' @param objective_function Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). #' For the meaning of each option, see `igraph::cluster_leiden()`. -#' @param knn_mat_method (function) if mat represents a regular non-knn matrix, this function will attempt to convert it a knn matrix. -#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn matrix. -#' @param knn_graph_method (function) if mat represents a knn matrix, this function will attempt to convert it to a graph matrix. +#' @param knn_mat_method (function) if mat represents a regular non-knn object, this function will attempt to convert it to a knn object. +#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn object. +#' @param knn_graph_method (function) if mat represents a knn object, this function will attempt to convert it to a graph matrix. #' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. #' Ignored if mat is already a graph adjacency matrix. #' @param seed Random seed for clustering initialization @@ -329,7 +329,7 @@ cluster_membership_matrix <- function(groups, group_order = NULL) { } -#' Get a knn matrix from reduced dimensions +#' Get a knn object from reduced dimensions #' #' Search for approximate nearest neighbors between cells in the reduced #' dimensions (e.g. 
PCA), and return the k nearest neighbors (knn) for each diff --git a/r/man/cluster.Rd b/r/man/cluster.Rd index a4580925..beea9c6c 100644 --- a/r/man/cluster.Rd +++ b/r/man/cluster.Rd @@ -46,10 +46,10 @@ cluster_graph_seurat( \item{objective_function}{Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). For the meaning of each option, see \code{igraph::cluster_leiden()}.} -\item{knn_mat_method}{(function) if mat represents a regular non-knn matrix, this function will attempt to convert it a knn matrix. -Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn matrix.} +\item{knn_mat_method}{(function) if mat represents a regular non-knn object, this function will attempt to convert it to a knn object. +Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn object.} -\item{knn_graph_method}{(function) if mat represents a knn matrix, this function will attempt to convert it to a graph matrix. +\item{knn_graph_method}{(function) if mat represents a knn object, this function will attempt to convert it to a graph matrix. Must be a (optionally partialized) version of \code{knn_to_graph()}, \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Ignored if mat is already a graph adjacency matrix.} diff --git a/r/man/convert_mat_to_cluster_matrix.Rd b/r/man/convert_mat_to_cluster_matrix.Rd index c7effc26..92293209 100644 --- a/r/man/convert_mat_to_cluster_matrix.Rd +++ b/r/man/convert_mat_to_cluster_matrix.Rd @@ -18,18 +18,18 @@ convert_mat_to_cluster_matrix( \item{required_mat_type}{(character) Required matrix type: "adjacency" or "knn".} -\item{knn_mat_method}{(function) Function to convert to a knn matrix (e.g., \code{knn_hnsw}, \code{knn_annoy}). 
+\item{knn_mat_method}{(function) Function to convert to a knn object (e.g., \code{knn_hnsw}, \code{knn_annoy}). Ignored if \code{mat} is already a knn or graph matrix.} -\item{knn_graph_method}{(function) Function to convert a knn matrix to a graph matrix +\item{knn_graph_method}{(function) Function to convert a knn object to a graph matrix (e.g., \code{knn_to_geodesic_graph}). Ignored if \code{required_mat_type} is "knn" or if \code{mat} is already a graph matrix. #' @details This function checks the type of the input matrix \code{mat}. \code{mat} is returned without modification if it is already the required type (adjacency or knn). -If \code{mat} is a standard matrix and \code{required_mat_type} is "knn", then \code{knn_mat_method} is used to convert \code{mat} to a knn matrix. -If \code{required_mat_type} is "adjacency", then \code{knn_mat_method} is used to first convert \code{mat} to a knn matrix, -then \code{knn_graph_method} is used to convert the knn matrix to an adjacency matrix.} +If \code{mat} is a standard matrix and \code{required_mat_type} is "knn", then \code{knn_mat_method} is used to convert \code{mat} to a knn object. +If \code{required_mat_type} is "adjacency", then \code{knn_mat_method} is used to first convert \code{mat} to a knn object, +then \code{knn_graph_method} is used to convert the knn object to an adjacency matrix.} } \value{ The converted matrix. 
diff --git a/r/man/is_knn_matrix.Rd b/r/man/is_knn_matrix.Rd index c3103d97..74c5db81 100644 --- a/r/man/is_knn_matrix.Rd +++ b/r/man/is_knn_matrix.Rd @@ -2,15 +2,15 @@ % Please edit documentation in R/clustering.R \name{is_knn_matrix} \alias{is_knn_matrix} -\title{Check if an input is a kNN output matrix} +\title{Check if an input is a kNN object} \usage{ is_knn_matrix(mat) } \value{ -TRUE if the mat is a kNN matrix, FALSE otherwise +TRUE if the mat is a knn object, FALSE otherwise } \description{ -knn matrix functions \code{knn_hnsw()} and \code{knn_annoy()} return a list of two matrices, \code{idx} and \code{dist}. +knn object functions \code{knn_hnsw()} and \code{knn_annoy()} return a list of two matrices, \code{idx} and \code{dist}. These are used as inputs to create graph adjacency matrices for clustering. Assume any list with at least both \code{idx} and \code{dist} items is a kNN object. } diff --git a/r/man/knn.Rd b/r/man/knn.Rd index 5952e69b..b92cc250 100644 --- a/r/man/knn.Rd +++ b/r/man/knn.Rd @@ -3,7 +3,7 @@ \name{knn_hnsw} \alias{knn_hnsw} \alias{knn_annoy} -\title{Get a knn matrix from reduced dimensions} +\title{Get a knn object from reduced dimensions} \usage{ knn_hnsw( data, diff --git a/r/man/knn_graph.Rd b/r/man/knn_graph.Rd index 044dfc10..bd5c031f 100644 --- a/r/man/knn_graph.Rd +++ b/r/man/knn_graph.Rd @@ -39,8 +39,8 @@ dist for cell x K neighbor distances} \item{use_weights}{boolean for whether to replace all distance weights with 1} -\item{knn_mat_method}{(function) if knn is not a knn matrix, this function will attempt to convert it to one. -Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn matrix.} +\item{knn_mat_method}{(function) if knn is not a knn object, this function will attempt to convert it to one. +Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. 
Ignored if knn is already a knn object.} \item{self_loops}{Whether to allow self-loops in the output graph} From 3fafebb4dd2af16c76d658ed3514b8cd3744c557 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 19 Feb 2025 18:46:39 -0800 Subject: [PATCH 102/142] [r] tidy up `DimReduction` docs styling --- r/R/singlecell_utils.R | 14 +++++++------- r/man/DimReduction.Rd | 2 +- r/man/IterativeLSI.Rd | 6 +++--- r/man/LSI.Rd | 6 +++--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index d49edc36..afe4b7dc 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -233,7 +233,7 @@ DimReduction <- function(mat, fitted_params = list(), feature_names = character( #' @param x DimReduction object. #' @return - `project()`: IterableMatrix object of the projected data. #' @details -#' **project()**: Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. +#' `project()`: Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. #' #' DimReduction subclasses should use the `project` method on new data with the same features, to project into the same latent space. #' All required information to run a projection should be held in `x$fitted_params`, including pertinent parameters when constructing the DimReduction subclass object. @@ -267,7 +267,7 @@ project.default <- function(x, mat, ...) { #' @param scale_factor (numeric) Scaling factor to multiply matrix by prior to log normalization (see formulas below). #' @param threads (integer) Number of threads to use. #' @returns -#' **LSI()** An object of class `c("LSI", "DimReduction")` with the following attributes: +#' `LSI()` An object of class `c("LSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data as a matrix of shape `(n_dimensions, ncol(mat))` #' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. 
Columns include the following: #' - `scale_factor`: The scale factor used for tf-idf normalization @@ -279,7 +279,7 @@ project.default <- function(x, mat, ...) { #' #' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: #' - 17.1 MB memory usage, 25.1 seconds runtime -#' @seealso `project()` `DimReduction()` `normalize_tfidf()` +#' @seealso `project()` `DimReduction()` `normalize_tfidf()` `normalize_log()` `svds()` #' @export LSI <- function( mat, n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, @@ -340,7 +340,7 @@ LSI <- function( } #' @rdname LSI #' @return -#' **project()** IterableMatrix of the projected data of shape `(n_dimensions, ncol(mat))`. +#' `project()` IterableMatrix of the projected data of shape `(n_dimensions, ncol(mat))`. #' @inheritParams project #' @export project.LSI <- function(x, mat, threads = 1L, ...) { @@ -389,7 +389,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' The user can pass in partial parameters to the cluster method, such as by passing #' `cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))`. #' @return -#' **IterativeLSI()** An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: +#' `IterativeLSI()` An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data as a matrix of shape `(n_dimensions, ncol(mat))` #' - `fitted_params`: A list of the parameters used for iterative LSI. Includes the following: #' - `lsi_method`: The method used for LSI @@ -422,7 +422,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' #' Additionally, the ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, #' which BPCells does not encounter even with a non-subsetted matrix. 
-#' @seealso `LSI()` `DimReduction()` `knn_hnsw()` `knn_annoy()` +#' @seealso `LSI()` `DimReduction()` `svd()` `knn_hnsw()` `knn_annoy()` #' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` #' `select_features_mean()` `select_features_binned_dispersion()` #' @inheritParams LSI @@ -516,7 +516,7 @@ IterativeLSI <- function( #' @rdname IterativeLSI #' @param iteration (integer) Which iteration of `IterativeLSI`'s features, loadings, and kept PCs to use for projection. #' @return -#' **project()** IterableMatrix of the projected data of shape `(n_dimensions, ncol(mat))`. +#' `project()` IterableMatrix of the projected data of shape `(n_dimensions, ncol(mat))`. #' @inheritParams project #' @export project.IterativeLSI <- function(x, mat, iteration = x$fitted_params$iterations, threads = 1L, ...) { diff --git a/r/man/DimReduction.Rd b/r/man/DimReduction.Rd index 94c8a579..9396767d 100644 --- a/r/man/DimReduction.Rd +++ b/r/man/DimReduction.Rd @@ -29,7 +29,7 @@ Child classes should implement a \code{project} method to allow for the projecti the fitted transformation object. } \details{ -\strong{project()}: Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. +\code{project()}: Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. DimReduction subclasses should use the \code{project} method on new data with the same features, to project into the same latent space. All required information to run a projection should be held in \code{x$fitted_params}, including pertinent parameters when constructing the DimReduction subclass object. 
diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index db98d412..797d9997 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -46,7 +46,7 @@ The user can pass in partial parameters to the cluster method, such as by passin \item{iteration}{(integer) Which iteration of \code{IterativeLSI}'s features, loadings, and kept PCs to use for projection.} } \value{ -\strong{IterativeLSI()} An object of class \code{c("IterativeLSI", "DimReduction")} with the following attributes: +\code{IterativeLSI()} An object of class \code{c("IterativeLSI", "DimReduction")} with the following attributes: \itemize{ \item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(n_dimensions, ncol(mat))} \item \code{fitted_params}: A list of the parameters used for iterative LSI. Includes the following: @@ -66,7 +66,7 @@ The user can pass in partial parameters to the cluster method, such as by passin } } -\strong{project()} IterableMatrix of the projected data of shape \verb{(n_dimensions, ncol(mat))}. +\code{project()} IterableMatrix of the projected data of shape \verb{(n_dimensions, ncol(mat))}. } \description{ Given a \verb{(features x cells)} counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. @@ -101,7 +101,7 @@ Additionally, the ArchR implementation calculates LSI during non-terminal iterat which BPCells does not encounter even with a non-subsetted matrix. 
} \seealso{ -\code{LSI()} \code{DimReduction()} \code{knn_hnsw()} \code{knn_annoy()} +\code{LSI()} \code{DimReduction()} \code{svd()} \code{knn_hnsw()} \code{knn_annoy()} \code{cluster_graph_leiden()} \code{cluster_graph_louvain()} \code{cluster_graph_seurat()} \code{select_features_variance()} \code{select_features_dispersion()} \code{select_features_mean()} \code{select_features_binned_dispersion()} } diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index f41ebf4d..c576cc74 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -31,7 +31,7 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} \item{x}{DimReduction object.} } \value{ -\strong{LSI()} An object of class \code{c("LSI", "DimReduction")} with the following attributes: +\code{LSI()} An object of class \code{c("LSI", "DimReduction")} with the following attributes: \itemize{ \item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(n_dimensions, ncol(mat))} \item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: @@ -44,7 +44,7 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} \item \code{feature_names}: The names of the features in the matrix } -\strong{project()} IterableMatrix of the projected data of shape \verb{(n_dimensions, ncol(mat))}. +\code{project()} IterableMatrix of the projected data of shape \verb{(n_dimensions, ncol(mat))}. } \description{ Given a \verb{(features x cells)} counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. 
@@ -59,5 +59,5 @@ Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: } } \seealso{ -\code{project()} \code{DimReduction()} \code{normalize_tfidf()} +\code{project()} \code{DimReduction()} \code{normalize_tfidf()} \code{normalize_log()} \code{svds()} } From ebe840bb9d42ed2251926e6e536fdcb5d8cafdc6 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 19 Feb 2025 18:50:18 -0800 Subject: [PATCH 103/142] [r] change `is_knn_matrix()` to `is_knn_object()` --- r/R/clustering.R | 6 +++--- r/man/{is_knn_matrix.Rd => is_knn_object.Rd} | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) rename r/man/{is_knn_matrix.Rd => is_knn_object.Rd} (89%) diff --git a/r/R/clustering.R b/r/R/clustering.R index f15c4192..e96af24e 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -14,7 +14,7 @@ #' Assume any list with at least both `idx` and `dist` items is a kNN object. #' @return TRUE if the mat is a knn object, FALSE otherwise #' @keywords internal -is_knn_matrix <- function(mat) { +is_knn_object <- function(mat) { return(is(mat, "list") && all(c("idx", "dist") %in% names(mat))) } @@ -61,10 +61,10 @@ convert_mat_to_cluster_matrix <- function( if (is(mat, "matrix")) { mat <- partial_apply(knn_mat_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) } - if (required_mat_type == "knn" && !is_knn_matrix(mat)) { + if (required_mat_type == "knn" && !is_knn_object(mat)) { pretty_error(mat, "must be a knn object, or convertible to one", 1) } - if (required_mat_type == "adjacency" && is_knn_matrix(mat)) { + if (required_mat_type == "adjacency" && is_knn_object(mat)) { mat <- partial_apply(knn_graph_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) } if (required_mat_type == "adjacency" && !is_adjacency_matrix(mat)) { diff --git a/r/man/is_knn_matrix.Rd b/r/man/is_knn_object.Rd similarity index 89% rename from r/man/is_knn_matrix.Rd rename to r/man/is_knn_object.Rd index 74c5db81..5713e23f 100644 --- 
a/r/man/is_knn_matrix.Rd +++ b/r/man/is_knn_object.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/clustering.R -\name{is_knn_matrix} -\alias{is_knn_matrix} +\name{is_knn_object} +\alias{is_knn_object} \title{Check if an input is a kNN object} \usage{ -is_knn_matrix(mat) +is_knn_object(mat) } \value{ TRUE if the mat is a knn object, FALSE otherwise From 1dfeb15a0fe7e4bda6d32a5619fb4a9d96d2c7f6 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 20 Feb 2025 15:46:00 -0800 Subject: [PATCH 104/142] [r] change docs styling for `IterativeLSI()` --- r/R/singlecell_utils.R | 15 +++++++++------ r/man/IterativeLSI.Rd | 18 +++++++++++------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index afe4b7dc..47ab97bb 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -415,13 +415,16 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' - If this is the final iteration, return the projected data from this PCA projection #' - Else, cluster the LSI results using `cluster_method` #' -#' There are some minor differences when compared to the ArchR implementation. Firstly, the ArchR implementation uses a different method for selecting features in the first iteration. -#' `select_features_mean(normalize = binarize)` can be passed in for the `feature_selection_method` argument to mimic the ArchR implementation, if choosing to only run one iteration. This function -#' currently does not support utilization of different feature selection methods across each iteration. If one desires to use a different feature selection method for each iteration, -#' they can take the cluster assignments from the previous iteration and use them to select features and run LSI. +#' There are some minor differences when compared to the ArchR implementation: +#' - The ArchR implementation uses a different method for selecting features in the first iteration. 
The default method is `select_features_variance`, which is the same as the ArchR implementation. +#' `select_features_mean(normalize_method = binarize)` can be passed in for the `feature_selection_method` argument to mimic the ArchR implementation, if choosing to only run one iteration. +#' This function currently does not support utilization of different feature selection methods across each iteration. +#' If one desires to use a different feature selection method for each iteration, they can take the cluster assignments from the previous iteration and use them to select features and run LSI. +#' - The ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, +#' which BPCells does not encounter even with a non-subsetted matrix. Therefore, IterativeLSI will run LSI on the entire matrix for each iteration. +#' - The ArchR implementation defaults to using Seurat clustering, which utilizes the Louvain algorithm (see `Seurat::FindClusters()`). In contrast, `IterativeLSI()` utilizes +#' leiden, which should provide the same clustering results while being faster. +#' -#' Additionally, the ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, -#' which BPCells does not encounter even with a non-subsetted matrix. 
#' @seealso `LSI()` `DimReduction()` `svd()` `knn_hnsw()` `knn_annoy()` #' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` #' `select_features_mean()` `select_features_binned_dispersion()` diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 797d9997..fc62193c 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -92,13 +92,17 @@ The Iterative LSI method is as follows: } } -There are some minor differences when compared to the ArchR implementation. Firstly, the ArchR implementation uses a different method for selecting features in the first iteration. -\code{select_features_mean(normalize = binarize)} can be passed in for the \code{feature_selection_method} argument to mimic the ArchR implementation, if choosing to only run one iteration. This function -currently does not support utilization of different feature selection methods across each iteration. If one desires to use a different feature selection method for each iteration, -they can take the cluster assignments from the previous iteration and use them to select features and run LSI. - -Additionally, the ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, -which BPCells does not encounter even with a non-subsetted matrix. +There are some minor differences when compared to the ArchR implementation: +\itemize{ +\item The ArchR implementation uses a different method for selecting features in the first iteration. The default method is \code{select_features_variance}, which is the same as the ArchR implementation. +\code{select_features_mean(normalize_method = binarize)} can be passed in for the \code{feature_selection_method} argument to mimic the ArchR implementation, if choosing to only run one iteration. 
+This function currently does not support utilization of different feature selection methods across each iteration. +If one desires to use a different feature selection method for each iteration, they can take the cluster assignments from the previous iteration and use them to select features and run LSI. +\item The ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, +which BPCells does not encounter even with a non-subsetted matrix. Therefore, IterativeLSI will run LSI on the entire matrix for each iteration. +\item The ArchR implementation defaults to using Seurat clustering, which utilizes the Louvain algorithm (see \code{Seurat::FindClusters()}). In contrast, \code{IterativeLSI()} utilizes +leiden, which should provide the same clustering results while being faster. +} } \seealso{ \code{LSI()} \code{DimReduction()} \code{svd()} \code{knn_hnsw()} \code{knn_annoy()} From d15a0e552840dc055883626a459697c596f28cbe Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 20 Feb 2025 16:05:53 -0800 Subject: [PATCH 105/142] [r] transpose cell embeddings in `DimReduction` --- r/R/singlecell_utils.R | 30 ++++++++++++------------ r/man/DimReduction.Rd | 2 +- r/man/IterativeLSI.Rd | 4 ++-- r/man/LSI.Rd | 2 +- r/tests/testthat/test-singlecell_utils.R | 20 ++++++++-------- 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 47ab97bb..50759bc6 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -211,7 +211,7 @@ select_features_binned_dispersion <- function( #' the fitted transformation object. #' @rdname DimReduction #' @param mat (IterableMatrix) Input matrix of shape `(features x cells)`. 
-#' @field cell_embeddings (IterableMatrix, dgCMatrix, matrix) Projected data of shape `(n_dimesions x n_cells)` of the original matrix after a dimensionality reduction. +#' @field cell_embeddings (IterableMatrix, dgCMatrix, matrix) Projected data of shape `(cells x n_dimensions)` of the original matrix after a dimensionality reduction. #' @field fitted_params (list) A list of parameters used for the transformation of a matrix. This should include all necessary information to project new data with the same features. #' @field feature_names (character) The names of the features that this DimReduction object was fit on. Matrices to be projected should have the same feature names. #' @return - `DimReduction()`: DimReduction object. @@ -268,7 +268,7 @@ project.default <- function(x, mat, ...) { #' @param threads (integer) Number of threads to use. #' @returns #' `LSI()` An object of class `c("LSI", "DimReduction")` with the following attributes: -#' - `cell_embeddings`: The projected data as a matrix of shape `(n_dimensions, ncol(mat))` +#' - `cell_embeddings`: The projected data as a matrix of shape `(cells, n_dimensions)` #' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. 
Columns include the following: #' - `scale_factor`: The scale factor used for tf-idf normalization #' - `feature_means`: The means of the features used for normalization @@ -315,14 +315,14 @@ LSI <- function( # Run pca if (verbose) log_progress("Calculating SVD") svd_attr <- svds(mat, k = n_dimensions, threads = threads) - pca_res <- t(svd_attr$u) %*% mat + pca_res <- t(mat) %*% svd_attr$u # Filter out PCs that are highly correlated with sequencing depth - pca_corrs <- abs(cor(read_depth, t(pca_res))) + pca_corrs <- abs(cor(read_depth, pca_res)) pca_feats_to_keep <- which(pca_corrs < corr_cutoff) if (length(pca_feats_to_keep) != n_dimensions) { if (verbose) log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) - pca_res <- pca_res[pca_feats_to_keep, ] + pca_res <- pca_res[, pca_feats_to_keep] } fitted_params <- list( scale_factor = scale_factor, @@ -366,9 +366,9 @@ project.LSI <- function(x, mat, threads = 1L, ...) { tempfile("mat"), compress = TRUE ) feature_loadings <- fitted_params$feature_loadings - res <- t(feature_loadings) %*% mat - if (length(fitted_params$pcs_to_keep) != nrow(res)) { - res <- res[fitted_params$pcs_to_keep, ] + res <- t(mat) %*% feature_loadings + if (length(fitted_params$pcs_to_keep) != ncol(res)) { + res <- res[, fitted_params$pcs_to_keep] } return(res) } @@ -390,7 +390,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' `cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))`. #' @return #' `IterativeLSI()` An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: -#' - `cell_embeddings`: The projected data as a matrix of shape `(n_dimensions, ncol(mat))` +#' - `cell_embeddings`: The projected data as a matrix of shape `(cells, n_dimensions)` #' - `fitted_params`: A list of the parameters used for iterative LSI. 
Includes the following: #' - `lsi_method`: The method used for LSI #' - `cluster_method`: The method used for clustering @@ -482,7 +482,7 @@ IterativeLSI <- function( if (verbose) log_progress("Running LSI") lsi_res_obj <- LSI( - mat = mat[mat_indices,], + mat = mat[mat_indices, ], n_dimensions = n_dimensions, scale_factor = scale_factor, threads = threads, @@ -499,7 +499,7 @@ IterativeLSI <- function( # cluster the LSI results if (verbose) log_progress("Clustering LSI results") - clustering_res <- t(lsi_res_obj$cell_embeddings[lsi_res_obj$fitted_params$pcs_to_keep, ]) %>% + clustering_res <- lsi_res_obj$cell_embeddings[, lsi_res_obj$fitted_params$pcs_to_keep] %>% partial_apply(cluster_method, threads = threads, .missing_args_error = FALSE)() fitted_params$iter_info$clusters[[i]] <- clustering_res # pseudobulk and pass onto next iteration @@ -519,7 +519,7 @@ IterativeLSI <- function( #' @rdname IterativeLSI #' @param iteration (integer) Which iteration of `IterativeLSI`'s features, loadings, and kept PCs to use for projection. #' @return -#' `project()` IterableMatrix of the projected data of shape `(n_dimensions, ncol(mat))`. +#' `project()` IterableMatrix of the projected data of shape `(cells, n_dimensions)`. #' @inheritParams project #' @export project.IterativeLSI <- function(x, mat, iteration = x$fitted_params$iterations, threads = 1L, ...) 
{ @@ -557,9 +557,9 @@ project.IterativeLSI <- function(x, mat, iteration = x$fitted_params$iterations, ) feature_loadings <- iter_info$feature_loadings[[1]] - res <- t(feature_loadings) %*% mat - if (length(iter_info$pcs_to_keep[[1]]) != nrow(res)) { - res <- res[iter_info$pcs_to_keep[[1]]$pcs_to_keep,] + res <- t(mat) %*% feature_loadings + if (length(iter_info$pcs_to_keep[[1]]) != ncol(res)) { + res <- res[, iter_info$pcs_to_keep[[1]]] } return(res) } diff --git a/r/man/DimReduction.Rd b/r/man/DimReduction.Rd index 9396767d..73943cd5 100644 --- a/r/man/DimReduction.Rd +++ b/r/man/DimReduction.Rd @@ -40,7 +40,7 @@ If there are rownames, reorder the matrix to match the order of the original mat \section{Fields}{ \describe{ -\item{\code{cell_embeddings}}{(IterableMatrix, dgCMatrix, matrix) Projected data of shape \verb{(n_dimesions x n_cells)} of the original matrix after a dimensionality reduction.} +\item{\code{cell_embeddings}}{(IterableMatrix, dgCMatrix, matrix) Projected data of shape \verb{(cells x n_dimensions)} of the original matrix after a dimensionality reduction.} \item{\code{fitted_params}}{(list) A list of parameters used for the transformation of a matrix. This should include all necessary information to project new data with the same features.} diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index fc62193c..74336a76 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -48,7 +48,7 @@ The user can pass in partial parameters to the cluster method, such as by passin \value{ \code{IterativeLSI()} An object of class \code{c("IterativeLSI", "DimReduction")} with the following attributes: \itemize{ -\item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(n_dimensions, ncol(mat))} +\item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(cells, n_dimensions)} \item \code{fitted_params}: A list of the parameters used for iterative LSI. 
Includes the following: \itemize{ \item \code{lsi_method}: The method used for LSI @@ -66,7 +66,7 @@ The user can pass in partial parameters to the cluster method, such as by passin } } -\code{project()} IterableMatrix of the projected data of shape \verb{(n_dimensions, ncol(mat))}. +\code{project()} IterableMatrix of the projected data of shape \verb{(cells, n_dimensions)}. } \description{ Given a \verb{(features x cells)} counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index c576cc74..72bb2430 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -33,7 +33,7 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} \value{ \code{LSI()} An object of class \code{c("LSI", "DimReduction")} with the following attributes: \itemize{ -\item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(n_dimensions, ncol(mat))} +\item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(cells, n_dimensions)} \item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. 
Columns include the following: \itemize{ \item \code{scale_factor}: The scale factor used for tf-idf normalization diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 6cd5b54f..c7404cf4 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -226,10 +226,10 @@ test_that("LSI works", { # Check that projection results in the same output if used on the same input matrix lsi_res_proj <- project(lsi_res_obj, mat) - expect_equal(nrow(lsi_res), 5) - expect_equal(ncol(lsi_res), ncol(mat)) - expect_equal(nrow(lsi_res_t), 5) - expect_equal(ncol(lsi_res_t), nrow(mat)) + expect_equal(ncol(lsi_res), 5) + expect_equal(nrow(lsi_res), ncol(mat)) + expect_equal(ncol(lsi_res_t), 5) + expect_equal(nrow(lsi_res_t), nrow(mat)) expect_equal(lsi_res, lsi_res_proj) }) @@ -242,10 +242,10 @@ test_that("Iterative LSI works", { lsi_res_proj <- project(lsi_res_obj, mat) lsi_res_proj_iter_1 <- expect_no_error(project(lsi_res_obj, mat, iteration = 1L)) lsi_res_embedding <- lsi_res_obj$cell_embeddings - expect_equal(ncol(lsi_res_embedding), ncol(mat)) - expect_equal(nrow(lsi_res_embedding), 10) - expect_equal(ncol(lsi_res_proj_iter_1), ncol(mat)) - expect_equal(nrow(lsi_res_proj_iter_1), 10) + expect_equal(nrow(lsi_res_embedding), ncol(mat)) + expect_equal(ncol(lsi_res_embedding), 10) + expect_equal(nrow(lsi_res_proj_iter_1), ncol(mat)) + expect_equal(ncol(lsi_res_proj_iter_1), 10) expect_equal(lsi_res_embedding, lsi_res_proj) }) @@ -265,7 +265,7 @@ test_that("Iterative LSI works with parameterized clustering", { ) lsi_res_proj <- project(lsi_res_obj, mat) lsi_res_embedding <- lsi_res_obj$cell_embeddings - expect_equal(ncol(lsi_res_embedding), ncol(mat)) - expect_equal(nrow(lsi_res_embedding), 10) + expect_equal(nrow(lsi_res_embedding), ncol(mat)) + expect_equal(ncol(lsi_res_embedding), 10) expect_equal(lsi_res_embedding, lsi_res_proj) }) \ No newline at end of file From 
df6406957b7e4d5a067d2ca7e923b8c0b7f98275 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 20 Feb 2025 16:23:57 -0800 Subject: [PATCH 106/142] [r] add pc removal tests for `LSI()`, make `LSI()` work when only one pc is kept --- r/R/singlecell_utils.R | 5 +++-- r/tests/testthat/test-singlecell_utils.R | 10 ++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 50759bc6..7cfa2384 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -322,7 +322,7 @@ LSI <- function( pca_feats_to_keep <- which(pca_corrs < corr_cutoff) if (length(pca_feats_to_keep) != n_dimensions) { if (verbose) log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) - pca_res <- pca_res[, pca_feats_to_keep] + pca_res <- pca_res[, pca_feats_to_keep] %>% as.matrix() } fitted_params <- list( scale_factor = scale_factor, @@ -368,7 +368,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) 
{ feature_loadings <- fitted_params$feature_loadings res <- t(mat) %*% feature_loadings if (length(fitted_params$pcs_to_keep) != ncol(res)) { - res <- res[, fitted_params$pcs_to_keep] + res <- res[, fitted_params$pcs_to_keep] %>% as.matrix() } return(res) } @@ -500,6 +500,7 @@ IterativeLSI <- function( # cluster the LSI results if (verbose) log_progress("Clustering LSI results") clustering_res <- lsi_res_obj$cell_embeddings[, lsi_res_obj$fitted_params$pcs_to_keep] %>% + as.matrix() %>% partial_apply(cluster_method, threads = threads, .missing_args_error = FALSE)() fitted_params$iter_info$clusters[[i]] <- clustering_res # pseudobulk and pass onto next iteration diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index c7404cf4..dc46e392 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -215,21 +215,27 @@ test_that("Feature selection by bin variance works", { }) test_that("LSI works", { + set.seed(12345) mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix") rownames(mat) <- paste0("feat", seq_len(nrow(mat))) colnames(mat) <- paste0("cell", seq_len(ncol(mat))) # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to ArchR - lsi_res_obj <- LSI(mat, n_dimensions = 5) - lsi_res_t_obj <- LSI(t(mat), n_dimensions = 5) + n_dimensions <- 5 + lsi_res_obj <- LSI(mat, n_dimensions = n_dimensions) + lsi_res_t_obj <- LSI(t(mat), n_dimensions = n_dimensions) lsi_res <- lsi_res_obj$cell_embeddings lsi_res_t <- lsi_res_t_obj$cell_embeddings # Check that projection results in the same output if used on the same input matrix lsi_res_proj <- project(lsi_res_obj, mat) + # Check setting pca correlations to non-1 value + lsi_res_obj_corr <- LSI(mat, n_dimensions = n_dimensions, corr_cutoff = 0.2) expect_equal(ncol(lsi_res), 5) expect_equal(nrow(lsi_res), ncol(mat)) expect_equal(ncol(lsi_res_t), 5) expect_equal(nrow(lsi_res_t), nrow(mat)) + expect_equal(nrow(lsi_res_proj), ncol(mat)) + expect_lt(ncol(lsi_res_obj_corr$cell_embeddings), n_dimensions) expect_equal(lsi_res, lsi_res_proj) }) From 6c248e083285d50b34cf918c8c6c0078af258507 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 20 Feb 2025 20:53:04 -0800 Subject: [PATCH 107/142] [r] expand `IterativeLSI()` docs --- r/R/singlecell_utils.R | 2 ++ r/man/IterativeLSI.Rd | 2 ++ 2 files changed, 4 insertions(+) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 7cfa2384..e4380eea 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -424,6 +424,8 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' which BPCells does not encounter even with a non-subsetted matrix. Therefore, IterativeLSI will run LSI on the entire matrix for each iteraiton. #' - The ArchR implementation defaults on using Seurat clustering for default, which utilizes the Louvain algorithm (See `Seurat::FindClusters()`). In constrast, `IterativeLSI()` utilizes #' leiden, which should provide the same clustering results while being faster. +#' - The ArchR implementation also plots a umap of every iteration's dimensionality reduction. 
While this is not implemented in `IterativeLSI()`, +#' one can use the `project()` method with the `iteration` argument set to the desired iteration to get projected data. This can then be fed into `uwot::umap()` #' #' @seealso `LSI()` `DimReduction()` `svd()` `knn_hnsw()` `knn_annoy()` #' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 74336a76..324c4314 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -102,6 +102,8 @@ If one desires to use a different feature selection method for each iteration, t which BPCells does not encounter even with a non-subsetted matrix. Therefore, IterativeLSI will run LSI on the entire matrix for each iteraiton. \item The ArchR implementation defaults on using Seurat clustering for default, which utilizes the Louvain algorithm (See \code{Seurat::FindClusters()}). In constrast, \code{IterativeLSI()} utilizes leiden, which should provide the same clustering results while being faster. +\item The ArchR implementation also plots a umap of every iteration's dimensionality reduction. While this is not implemented in \code{IterativeLSI()}, +one can use the \code{project()} method with the \code{iteration} argument set to the desired iteration to get projected data. 
This can then be fed into \code{uwot::umap()} } } \seealso{ From 4ce1c61d81a3ade2f4aa6032d5b76a9446b272f7 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Sun, 23 Feb 2025 23:18:40 -0800 Subject: [PATCH 108/142] [r] general docs and code cleanup --- r/R/clustering.R | 8 +++---- r/R/singlecell_utils.R | 33 +++++++++++++++----------- r/R/transforms.R | 2 +- r/man/DimReduction.Rd | 11 ++++++--- r/man/IterativeLSI.Rd | 2 +- r/man/LSI.Rd | 3 ++- r/man/call_macs_peaks.Rd | 14 +++++++++++ r/man/convert_mat_to_cluster_matrix.Rd | 8 +++---- r/man/feature_selection.Rd | 13 ++++++---- r/man/normalize.Rd | 2 +- 10 files changed, 62 insertions(+), 34 deletions(-) create mode 100644 r/man/call_macs_peaks.Rd diff --git a/r/R/clustering.R b/r/R/clustering.R index e96af24e..587574b8 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -36,9 +36,9 @@ is_adjacency_matrix <- function(mat) { #' #' @param mat Input matrix to be converted. #' @param required_mat_type (character) Required matrix type: "adjacency" or "knn". -#' @param knn_mat_method (function) Function to convert to a knn object (e.g., `knn_hnsw`, `knn_annoy`). -#' Ignored if `mat` is already a knn or graph matrix. -#' @param knn_graph_method (function) Function to convert a knn object to a graph matrix +#' @param knn_mat_method (function) Function to convert input to a knn object (e.g., `knn_hnsw`, `knn_annoy`). +#' Ignored if `mat` is already a knn object or graph matrix. +#' @param knn_graph_method (function) Function to convert a knn object to a graph adjacency matrix #' (e.g., `knn_to_geodesic_graph`). Ignored if `required_mat_type` is "knn" or if #' `mat` is already a graph matrix. #' #' @details @@ -46,7 +46,7 @@ is_adjacency_matrix <- function(mat) { #' it is already the required type (adjacency or knn). #' If `mat` is a standard matrix and `required_mat_type` is "knn", then `knn_mat_method` is used to convert `mat` to a knn object. 
#' If `required_mat_type` is "adjacency", then `knn_mat_method` is used to first convert `mat` to a knn object, -#' then `knn_graph_method` is used to convert the knn object to an adjacency matrix. +#' then `knn_graph_method` is used to convert the knn object to a graph adjacency matrix. #' @return The converted matrix. #' @keywords internal convert_mat_to_cluster_matrix <- function( diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index e4380eea..9d176da5 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -13,13 +13,16 @@ #' Feature selection functions #' -#' Apply a feature selection method to a non-normalized `(features x cells)` matrix. We recommend using counts matrices as input and -#' apply any normalizations prior to feature selection via the normalize argument (if available). Instead of directly subsetting the input matrix, -#' an output dataframe is provided indicating which features are highly variable, and the scoring of each feature. +#' Apply a feature selection method to a non-normalized `(features x cells)` matrix. +#' +#' We recommend using counts matrices as input and to +#' apply any normalizations prior to feature selection via the normalize argument (if available). +#' Instead of directly subsetting the input matrix, +#' an output dataframe is returned, indicating which features are highly variable, and the scoring of each feature. #' @rdname feature_selection -#' @param mat (IterableMatrix) dimensions features x cells +#' @param mat (IterableMatrix) Counts matrix with dimensions `(features x cells)`. #' @param num_feats (float) Number of features to mark as highly_variable. If 0 < num_feats < 1, then interpret it as a fraction of features. -#' @param normalize_method (function) Normalize the matrix prior to feature selection by calling normalize(mat) if it's not NULL. +#' @param normalize_method (function) Used to normalize the matrix prior to feature selection by calling `normalize_method(mat)` if it is not NULL. 
#' For example, pass normalize_log() or normalize_tfidf(). #' If the normalize function accepts a threads argument, that will passed as normalize(mat, threads=threads). #' @param threads (integer) Number of threads to use. @@ -186,7 +189,7 @@ select_features_binned_dispersion <- function( ) # Bin by mean, and normalize dispersion with each bin features_df <- features_df %>% - dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% + dplyr::mutate(bin = cut(mean, n_bins, labels = FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( score = (dispersion - mean(dispersion)) / sd(dispersion), @@ -210,17 +213,16 @@ select_features_binned_dispersion <- function( #' Child classes should implement a `project` method to allow for the projection of other matrices using #' the fitted transformation object. #' @rdname DimReduction -#' @param mat (IterableMatrix) Input matrix of shape `(features x cells)`. #' @field cell_embeddings (IterableMatrix, dgCMatrix, matrix) Projected data of shape `(cells x n_dimensions)` of the original matrix after a dimensionality reduction. #' @field fitted_params (list) A list of parameters used for the transformation of a matrix. This should include all necessary information to project new data with the same features. #' @field feature_names (character) The names of the features that this DimReduction object was fit on. Matrices to be projected should have the same feature names. #' @return - `DimReduction()`: DimReduction object. #' @export -DimReduction <- function(mat, fitted_params = list(), feature_names = character(0), ...) { - assert_is(mat, c("IterableMatrix", "dgCMatrix", "matrix")) +DimReduction <- function(cell_embeddings, fitted_params = list(), feature_names = character(0), ...) { + assert_is(cell_embeddings, c("IterableMatrix", "dgCMatrix", "matrix")) assert_is(fitted_params, "list") structure(list( - cell_embeddings = mat, + cell_embeddings = cell_embeddings, fitted_params = fitted_params, feature_names = feature_names, ... 
@@ -230,6 +232,7 @@ DimReduction <- function(mat, fitted_params = list(), feature_names = character( } #' @rdname DimReduction +#' @param mat (IterableMatrix) Input matrix of shape `(features x cells)`. #' @param x DimReduction object. #' @return - `project()`: IterableMatrix object of the projected data. #' @details @@ -257,7 +260,8 @@ project.default <- function(x, mat, ...) { #' Perform latent semantic indexing (LSI) on a matrix. #' -#' Given a `(features x cells)` counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. +#' Given a `(features x cells)` counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation +#' of the matrix of shape `(n_dimensions, ncol(mat))`. #' Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. #' @rdname LSI #' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. @@ -331,7 +335,7 @@ LSI <- function( feature_loadings = svd_attr$u ) res <- DimReduction( - mat = pca_res, + cell_embeddings = pca_res, fitted_params = fitted_params, feature_names = rownames(mat) ) @@ -512,7 +516,7 @@ IterativeLSI <- function( } if (verbose) log_progress("Finished running Iterative LSI") res <- DimReduction( - mat = lsi_res_obj$cell_embeddings, + cell_embeddings = lsi_res_obj$cell_embeddings, fitted_params = fitted_params, feature_names = rownames(mat) ) @@ -520,13 +524,14 @@ IterativeLSI <- function( return(res) } #' @rdname IterativeLSI -#' @param iteration (integer) Which iteration of `IterativeLSI`'s features, loadings, and kept PCs to use for projection. +#' @param iteration (integer) Which iteration of `IterativeLSI`'s features and loadings to use for projection. #' @return #' `project()` IterableMatrix of the projected data of shape `(cells, n_dimensions)`. 
#' @inheritParams project #' @export project.IterativeLSI <- function(x, mat, iteration = x$fitted_params$iterations, threads = 1L, ...) { assert_is_mat(mat) + assert_is_wholenumber(threads) fitted_params <- x$fitted_params # Get the desired row of iter_info tibble assert_is_wholenumber(iteration) diff --git a/r/R/transforms.R b/r/R/transforms.R index 6a32ccb3..43342e98 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -948,7 +948,7 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' Apply standard normalizations to a `(features x cells)` counts matrix. #' #' @rdname normalize -#' @param mat (IterableMatrix) Counts matrix to normalize. `(features x cells)` +#' @param mat (IterableMatrix) Counts matrix with dimensions `(features x cells)`. #' @param scale_factor (numeric) Scaling factor to multiply matrix by prior to normalization (see formulas below). #' @param threads (integer) Number of threads to use. #' @returns For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, diff --git a/r/man/DimReduction.Rd b/r/man/DimReduction.Rd index 73943cd5..17ee89ce 100644 --- a/r/man/DimReduction.Rd +++ b/r/man/DimReduction.Rd @@ -5,14 +5,19 @@ \alias{project} \title{Barebones definition of a DimReduction class.} \usage{ -DimReduction(mat, fitted_params = list(), feature_names = character(0), ...) +DimReduction( + cell_embeddings, + fitted_params = list(), + feature_names = character(0), + ... +) project(x, mat, ...) 
} \arguments{ -\item{mat}{(IterableMatrix) Input matrix of shape \verb{(features x cells)}.} - \item{x}{DimReduction object.} + +\item{(IterableMatrix)}{Input matrix of shape \verb{(features x cells)}.} } \value{ \itemize{ diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 324c4314..aad46eeb 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -43,7 +43,7 @@ The user can pass in partial parameters to the cluster method, such as by passin \item{x}{DimReduction object.} -\item{iteration}{(integer) Which iteration of \code{IterativeLSI}'s features, loadings, and kept PCs to use for projection.} +\item{iteration}{(integer) Which iteration of \code{IterativeLSI}'s features and loadings to use for projection.} } \value{ \code{IterativeLSI()} An object of class \code{c("IterativeLSI", "DimReduction")} with the following attributes: diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index 72bb2430..ec3d1039 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -47,7 +47,8 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} \code{project()} IterableMatrix of the projected data of shape \verb{(n_dimensions, ncol(mat))}. } \description{ -Given a \verb{(features x cells)} counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. +Given a \verb{(features x cells)} counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation +of the matrix of shape \verb{(n_dimensions, ncol(mat))}. Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. 
} \details{ diff --git a/r/man/call_macs_peaks.Rd b/r/man/call_macs_peaks.Rd new file mode 100644 index 00000000..5ad425c5 --- /dev/null +++ b/r/man/call_macs_peaks.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/atac_utils.R +\name{call_macs_peaks} +\alias{call_macs_peaks} +\title{Call peaks using MACS2/3} +\usage{ +call_macs_peaks(...) +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} + +This function has been renamed to \code{call_peaks_macs()} +} +\keyword{internal} diff --git a/r/man/convert_mat_to_cluster_matrix.Rd b/r/man/convert_mat_to_cluster_matrix.Rd index 92293209..c00d7bdd 100644 --- a/r/man/convert_mat_to_cluster_matrix.Rd +++ b/r/man/convert_mat_to_cluster_matrix.Rd @@ -18,10 +18,10 @@ convert_mat_to_cluster_matrix( \item{required_mat_type}{(character) Required matrix type: "adjacency" or "knn".} -\item{knn_mat_method}{(function) Function to convert to a knn object (e.g., \code{knn_hnsw}, \code{knn_annoy}). -Ignored if \code{mat} is already a knn or graph matrix.} +\item{knn_mat_method}{(function) Function to convert input to a knn object (e.g., \code{knn_hnsw}, \code{knn_annoy}). +Ignored if \code{mat} is already a knn object or graph matrix.} -\item{knn_graph_method}{(function) Function to convert a knn object to a graph matrix +\item{knn_graph_method}{(function) Function to convert a knn object to a graph adjacency matrix (e.g., \code{knn_to_geodesic_graph}). Ignored if \code{required_mat_type} is "knn" or if \code{mat} is already a graph matrix. #' @details @@ -29,7 +29,7 @@ This function checks the type of the input matrix \code{mat}. \code{mat} is retu it is already the required type (adjacency or knn). If \code{mat} is a standard matrix and \code{required_mat_type} is "knn", then \code{knn_mat_method} is used to convert \code{mat} to a knn object. 
If \code{required_mat_type} is "adjacency", then \code{knn_mat_method} is used to first convert \code{mat} to a knn object, -then \code{knn_graph_method} is used to convert the knn object to an adjacency matrix.} +then \code{knn_graph_method} is used to convert the knn object to a graph adjacency matrix.} } \value{ The converted matrix. diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index 730aa1a8..1e640ff0 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -31,11 +31,11 @@ select_features_binned_dispersion( ) } \arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} +\item{mat}{(IterableMatrix) Counts matrix with dimensions \verb{(features x cells)}.} \item{num_feats}{(float) Number of features to mark as highly_variable. If 0 < num_feats < 1, then interpret it as a fraction of features.} -\item{normalize_method}{(function) Normalize the matrix prior to feature selection by calling normalize(mat) if it's not NULL. +\item{normalize_method}{(function) Used to normalize the matrix prior to feature selection by calling \code{normalize_method(mat)} if it is not NULL. For example, pass normalize_log() or normalize_tfidf(). If the normalize function accepts a threads argument, that will passed as normalize(mat, threads=threads).} @@ -71,11 +71,14 @@ each feature \eqn{x_i} as follows: } } \description{ -Apply a feature selection method to a non-normalized \verb{(features x cells)} matrix. We recommend using counts matrices as input and -apply any normalizations prior to feature selection via the normalize argument (if available). Instead of directly subsetting the input matrix, -an output dataframe is provided indicating which features are highly variable, and the scoring of each feature. +Apply a feature selection method to a non-normalized \verb{(features x cells)} matrix. 
} \details{ +We recommend using counts matrices as input and to +apply any normalizations prior to feature selection via the normalize argument (if available). +Instead of directly subsetting the input matrix, +an output dataframe is returned, indicating which features are highly variable, and the scoring of each feature. + \code{select_features_binned_dispersion} implements the approach from Satija et al. 2015: \enumerate{ \item Bin features into equal-width bins by \code{log1p(mean)} diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd index 22e9218c..5ca3f814 100644 --- a/r/man/normalize.Rd +++ b/r/man/normalize.Rd @@ -10,7 +10,7 @@ normalize_log(mat, scale_factor = 10000, threads = 1L) normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) } \arguments{ -\item{mat}{(IterableMatrix) Counts matrix to normalize. \verb{(features x cells)}} +\item{mat}{(IterableMatrix) Counts matrix with dimensions \verb{(features x cells)}.} \item{scale_factor}{(numeric) Scaling factor to multiply matrix by prior to normalization (see formulas below).} From 17d9d96fecdb3c30f3a441feef47ebc9b2c03240 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Sun, 23 Feb 2025 23:27:27 -0800 Subject: [PATCH 109/142] [r] add `RcppAnnoy` dependency check and requirement --- r/DESCRIPTION | 3 ++- r/R/clustering.R | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index f8fab3f6..1f5a3842 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -49,7 +49,8 @@ Suggests: GenomicRanges, matrixStats, igraph, - RcppHNSW + RcppHNSW, + RcppAnnoy Depends: R (>= 3.5.0) Config/Needs/website: pkgdown, devtools, uwot, irlba, RcppHNSW, igraph, BiocManager, bioc::BSgenome.Hsapiens.UCSC.hg38, github::GreenleafLab/motifmatchr, github::GreenleafLab/chromVARmotifs diff --git a/r/R/clustering.R b/r/R/clustering.R index 587574b8..af57b43c 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -356,6 +356,7 @@ knn_hnsw <- function(data, query = NULL, k = 
10, metric = c("euclidean", "cosine metric <- match.arg(metric) assert_is(verbose, "logical") assert_is_wholenumber(threads) + assert_has_package("RcppHNSW") if (rlang::is_missing(data)) return(create_partial()) index <- RcppHNSW::hnsw_build( data, @@ -392,6 +393,7 @@ knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine #' @export knn_annoy <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, search_k = -1) { metric <- match.arg(metric) + assert_has_package("RcppAnnoy") if (rlang::is_missing(data)) return(create_partial()) if (is.null(query)) query <- data annoy <- switch(metric, From f8f4c04f3248381310326ec2b24dc90af224dea2 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Sun, 23 Feb 2025 23:30:17 -0800 Subject: [PATCH 110/142] [r] clean up feature selection docs --- r/R/singlecell_utils.R | 4 +--- r/man/feature_selection.Rd | 8 +++----- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 9d176da5..bbfd5d83 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -13,9 +13,7 @@ #' Feature selection functions #' -#' Apply a feature selection method to a non-normalized `(features x cells)` matrix. -#' -#' We recommend using counts matrices as input and to +#' Apply a feature selection method to a non-normalized `(features x cells)` matrix. We recommend using counts matrices as input and to #' apply any normalizations prior to feature selection via the normalize argument (if available). #' Instead of directly subsetting the input matrix, #' an output dataframe is returned, indicating which features are highly variable, and the scoring of each feature. 
diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index 1e640ff0..045b6f3e 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -71,14 +71,12 @@ each feature \eqn{x_i} as follows: } } \description{ -Apply a feature selection method to a non-normalized \verb{(features x cells)} matrix. -} -\details{ -We recommend using counts matrices as input and to +Apply a feature selection method to a non-normalized \verb{(features x cells)} matrix. We recommend using counts matrices as input and to apply any normalizations prior to feature selection via the normalize argument (if available). Instead of directly subsetting the input matrix, an output dataframe is returned, indicating which features are highly variable, and the scoring of each feature. - +} +\details{ \code{select_features_binned_dispersion} implements the approach from Satija et al. 2015: \enumerate{ \item Bin features into equal-width bins by \code{log1p(mean)} From 810adac7b0b2598d44bd1d167ed496bda2fc8cb9 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 24 Feb 2025 00:53:20 -0800 Subject: [PATCH 111/142] [r] more docs cleanups --- r/R/singlecell_utils.R | 16 ++++++++-------- r/man/IterativeLSI.Rd | 12 ++++++------ r/man/feature_selection.Rd | 7 ++++++- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index bbfd5d83..62870486 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -117,7 +117,7 @@ select_features_dispersion <- function( #' @returns #' - `select_features_mean`: \eqn{\mathrm{Score}(x_i) = \frac{\sum_{j=1}^{n}\bigl(x_{ij}\bigr)}{n}} #' @export -select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L) { +select_features_mean <- function(mat, num_feats = 0.05, normalize_method = NULL, threads = 1L) { assert_greater_than_zero(num_feats) assert_is_wholenumber(threads) if (rlang::is_missing(mat)) return(create_partial()) @@ -127,7 +127,7 @@ 
select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) } num_feats <- min(max(num_feats, 0), nrow(mat)) - if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads, .missing_args_error = FALSE)(mat) + if (!is.null(normalize_method)) mat <- partial_apply(normalize_method, threads = threads, .missing_args_error = FALSE)(mat) # get the sum of each feature, binarized features_df <- tibble::tibble( feature = rownames(mat), @@ -418,15 +418,15 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' - Else, cluster the LSI results using `cluster_method` #' #' There are some minor differences when compared to the ArchR implementation: -#' - The ArchR implementation uses a different method for selecting features in the first iteration. The default method is `select_features_variance`, which is the same as the ArchR implementation. +#' - ArchR uses a different method for selecting features in the first iteration. The default method is `select_features_variance`, which is the same as the ArchR implementation. #' `select_features_mean(normalize_method = binarize)` can be passed in for the `feature_selection_method` argument to mimic the ArchR implementation, if choosing to only run one iteration. -#' Firstly, the ArchR implementation uses a different method for selecting features in the first iteration. This function currently does not support utilization of different feature selection methods across each iteration. +#' - `IterativeLSI()` currently does not support utilization of different feature selection methods across each iteration. #' If one desires to use a different feature selection method for each iteration, they can take the cluster assignments from the previous iteration and use them to select features and run LSI. 
-#' - The ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, -#' which BPCells does not encounter even with a non-subsetted matrix. Therefore, IterativeLSI will run LSI on the entire matrix for each iteraiton. -#' - The ArchR implementation defaults on using Seurat clustering for default, which utilizes the Louvain algorithm (See `Seurat::FindClusters()`). In constrast, `IterativeLSI()` utilizes +#' - ArchR calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, +#' which BPCells does not encounter even with a non-subsetted matrix. Therefore, IterativeLSI will run LSI on the entire matrix for each iteration. +#' - ArchR defaults on using Seurat clustering for default, which utilizes the Louvain algorithm (See `Seurat::FindClusters()`). In contrast, `IterativeLSI()` utilizes #' leiden, which should provide the same clustering results while being faster. -#' - The ArchR implementation also plots a umap of every iteration's dimensionality reduction. While this is not implemented in `IterativeLSI()`, +#' - ArchR also plots a umap of every iteration's dimensionality reduction. While this is not implemented in `IterativeLSI()`, #' one can use the `project()` method with the `iteration` argument set to the desired iteration to get projected data. This can then be fed into `uwot::umap()` #' #' @seealso `LSI()` `DimReduction()` `svd()` `knn_hnsw()` `knn_annoy()` diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index aad46eeb..06fa66a3 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -94,15 +94,15 @@ The Iterative LSI method is as follows: There are some minor differences when compared to the ArchR implementation: \itemize{ -\item The ArchR implementation uses a different method for selecting features in the first iteration. 
The default method is \code{select_features_variance}, which is the same as the ArchR implementation. +\item ArchR uses a different method for selecting features in the first iteration. The default method is \code{select_features_variance}, which is the same as the ArchR implementation. \code{select_features_mean(normalize_method = binarize)} can be passed in for the \code{feature_selection_method} argument to mimic the ArchR implementation, if choosing to only run one iteration. -Firstly, the ArchR implementation uses a different method for selecting features in the first iteration. This function currently does not support utilization of different feature selection methods across each iteration. +\item \code{IterativeLSI()} currently does not support utilization of different feature selection methods across each iteration. If one desires to use a different feature selection method for each iteration, they can take the cluster assignments from the previous iteration and use them to select features and run LSI. -\item The ArchR implementation calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, -which BPCells does not encounter even with a non-subsetted matrix. Therefore, IterativeLSI will run LSI on the entire matrix for each iteraiton. -\item The ArchR implementation defaults on using Seurat clustering for default, which utilizes the Louvain algorithm (See \code{Seurat::FindClusters()}). In constrast, \code{IterativeLSI()} utilizes +\item ArchR calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, +which BPCells does not encounter even with a non-subsetted matrix. Therefore, IterativeLSI will run LSI on the entire matrix for each iteration. +\item ArchR defaults on using Seurat clustering for default, which utilizes the Louvain algorithm (See \code{Seurat::FindClusters()}). 
In constrast, \code{IterativeLSI()} utilizes leiden, which should provide the same clustering results while being faster. -\item The ArchR implementation also plots a umap of every iteration's dimensionality reduction. While this is not implemented in \code{IterativeLSI()}, +\item ArchR also plots a umap of every iteration's dimensionality reduction. While this is not implemented in \code{IterativeLSI()}, one can use the \code{project()} method with the \code{iteration} argument set to the desired iteration to get projected data. This can then be fed into \code{uwot::umap()} } } diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index 045b6f3e..ccc4c50b 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -21,7 +21,12 @@ select_features_dispersion( threads = 1L ) -select_features_mean(mat, num_feats = 0.05, normalize = NULL, threads = 1L) +select_features_mean( + mat, + num_feats = 0.05, + normalize_method = NULL, + threads = 1L +) select_features_binned_dispersion( mat, From 4af4e89e2e882126bbe0e5d195da16a40308aea8 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 25 Feb 2025 00:16:41 -0800 Subject: [PATCH 112/142] [r] add requested pr changes --- r/NEWS.md | 7 ++++- r/R/clustering.R | 38 ++++++++++++------------ r/R/singlecell_utils.R | 27 +++++++---------- r/man/IterativeLSI.Rd | 6 ++-- r/man/knn_graph.Rd | 8 ++--- r/tests/testthat/test-singlecell_utils.R | 10 +++---- 6 files changed, 48 insertions(+), 48 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index d966de80..f50361da 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -8,6 +8,10 @@ Contributions welcome :) # BPCells 0.3.1 (in-progress main branch) +## Breaking changes +- Change first parameter name of `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()` from `snn` to `mat` to more accurately reflect the input type. 
(pull request #189) +- Added non-final parameters, `knn_obj_method` and `knn_graph_method` to `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()`. (pull request #189) + ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166) - Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168) @@ -15,12 +19,13 @@ Contributions welcome :) - Add feature selection functions `select_features_variance()`, and `select_features_{dispersion,mean,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189) - Add `LSI()` and `IterativeLSI()` dimensionality functions to perform latent semantic indexing on a matrix (pull request #189). - Add capability to create partial function objects in when excluding the first argument of a function. This is implemented in normalizations, feature selections, dimensionality reductions, and clustering functions. See `select_features_variance()` for usage. (pull request #189) +- Allowed clustering functions `cluster_graph_leiden()`, `cluster_graph_louvain()`, and `cluster_graph_seurat()` to also perform knn object and graph adjacency construction intermediate steps with `knn_obj_method` and `knn_graph_method` parameters. Also provided +`threads` and `verbose` arguments to clustering functions that are automatically passed down to knn object/graph adjacency construction steps. (pull request #189) ## Improvements - Speed up taking large subsets of large concatenated matrices, e.g. selecting 9M cells from a 10M cell matrix composed of ~100 concatenated pieces. (pull request #179) - `matrix_stats()` now also works with types `matrix` and `dgCMatrix`. 
(pull request #190) - Fixed memory errors when running `writeInsertionBed()` and `writeInsertionBedGraph()` (pull request #{118, 134}) -- Change parameter name of `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()` from `snn` to `mat` to more accurately reflect the input type. (pull request #189) ## Bug-fixes - Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175) diff --git a/r/R/clustering.R b/r/R/clustering.R index af57b43c..edb01c7b 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -36,7 +36,7 @@ is_adjacency_matrix <- function(mat) { #' #' @param mat Input matrix to be converted. #' @param required_mat_type (character) Required matrix type: "adjacency" or "knn". -#' @param knn_mat_method (function) Function to convert input to a knn object (e.g., `knn_hnsw`, `knn_annoy`). +#' @param knn_obj_method (function) Function to convert input to a knn object (e.g., `knn_hnsw`, `knn_annoy`). #' Ignored if `mat` is already a knn object or graph matrix. #' @param knn_graph_method (function) Function to convert a knn object to a graph adjacency matrix #' (e.g., `knn_to_geodesic_graph`). Ignored if `required_mat_type` is "knn" or if @@ -44,22 +44,22 @@ is_adjacency_matrix <- function(mat) { #' #' @details #' This function checks the type of the input matrix `mat`. `mat` is returned without modification if #' it is already the required type (adjacency or knn). -#' If `mat` is a standard matrix and `required_mat_type` is "knn", then `knn_mat_method` is used to convert `mat` to a knn object. -#' If `required_mat_type` is "adjacency", then `knn_mat_method` is used to first convert `mat` to a knn object, +#' If `mat` is a standard matrix and `required_mat_type` is "knn", then `knn_obj_method` is used to convert `mat` to a knn object. 
+#' If `required_mat_type` is "adjacency", then `knn_obj_method` is used to first convert `mat` to a knn object, #' then `knn_graph_method` is used to convert the knn object to a graph adjacency matrix. #' @return The converted matrix. #' @keywords internal convert_mat_to_cluster_matrix <- function( mat, required_mat_type = c("knn", "adjacency"), - knn_mat_method = knn_hnsw, + knn_obj_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, threads = 1L, verbose = FALSE ) { required_mat_type <- match.arg(required_mat_type) if (is(mat, "matrix")) { - mat <- partial_apply(knn_mat_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) + mat <- partial_apply(knn_obj_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) } if (required_mat_type == "knn" && !is_knn_object(mat)) { pretty_error(mat, "must be a knn object, or convertible to one", 1) @@ -86,20 +86,20 @@ convert_mat_to_cluster_matrix <- function( #' dist for cell x K neighbor distances #' @param use_weights boolean for whether to replace all distance weights with 1 #' @param self_loops boolean for whether to allow cells to count themselves as neighbors -#' @param knn_mat_method (function) if knn is not a knn object, this function will attempt to convert it to one. +#' @param knn_obj_method (function) if knn is not a knn object, this function will attempt to convert it to one. #' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn object. #' @param threads (integer) Number of threads to use. 
#' @param verbose (logical) Whether to print progress information during search #' @return **knn_to_graph** #' Sparse matrix (dgCMatrix) where `mat[i,j]` = distance from cell `i` to #' cell `j`, or 0 if cell `j` is not in the K nearest neighbors of `i` -knn_to_graph <- function(knn, use_weights = FALSE, knn_mat_method = knn_hnsw, self_loops = TRUE, threads = 0L, verbose = FALSE) { +knn_to_graph <- function(knn, use_weights = FALSE, knn_obj_method = knn_hnsw, self_loops = TRUE, threads = 0L, verbose = FALSE) { assert_is(use_weights, "logical") assert_is(self_loops, "logical") assert_is_wholenumber(threads) assert_is(verbose, "logical") if (rlang::is_missing(knn)) return(create_partial()) - mat <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_mat_method = knn_mat_method, threads = threads, verbose = verbose) + mat <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_obj_method = knn_obj_method, threads = threads, verbose = verbose) if (use_weights) { weights <- knn$dist } else { @@ -138,13 +138,13 @@ knn_to_graph <- function(knn, use_weights = FALSE, knn_mat_method = knn_hnsw, se #' These correspond to the rows, cols, and values of non-zero entries in the lower triangle #' adjacency matrix. 
`dim` is the total number of vertices (cells) in the graph #' @export -knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, knn_mat_method = knn_hnsw, return_type=c("matrix", "list"), threads = 0L, verbose = FALSE) { +knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, knn_obj_method = knn_hnsw, return_type=c("matrix", "list"), threads = 0L, verbose = FALSE) { return_type <- match.arg(return_type) assert_is(self_loops, "logical") assert_is_wholenumber(threads) assert_is(verbose, "logical") if (rlang::is_missing(knn)) return(create_partial()) - knn <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_mat_method = knn_mat_method, threads = threads, verbose = verbose) + knn <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_obj_method = knn_obj_method, threads = threads, verbose = verbose) # Solve x / (2*K - x) >= min_val --> x >= 2*K*min_val / (1 + min_val) min_int <- ceiling(2*min_val*ncol(knn$idx) / (1 + min_val)) snn <- build_snn_graph_cpp(knn$idx, min_neighbors = min_int) @@ -193,11 +193,11 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, knn_mat_ #' These correspond to the rows, cols, and values of non-zero entries in the lower triangle #' adjacency matrix. 
`dim` is the total number of vertices (cells) in the graph #' @export -knn_to_geodesic_graph <- function(knn, knn_mat_method = knn_hnsw, return_type = c("matrix", "list"), threads = 0L, verbose = FALSE) { +knn_to_geodesic_graph <- function(knn, knn_obj_method = knn_hnsw, return_type = c("matrix", "list"), threads = 0L, verbose = FALSE) { return_type <- match.arg(return_type) assert_is_wholenumber(threads) if (rlang::is_missing(knn)) return(create_partial()) - knn <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_mat_method = knn_mat_method, threads = threads, verbose = verbose) + knn <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_obj_method = knn_obj_method, threads = threads, verbose = verbose) graph <- build_umap_graph_cpp(knn$dist, knn$idx, threads=threads) graph$dim <- nrow(knn$idx) @@ -221,7 +221,7 @@ knn_to_geodesic_graph <- function(knn, knn_mat_method = knn_hnsw, return_type = #' @param resolution Resolution parameter. Higher values result in more clusters #' @param objective_function Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). #' For the meaning of each option, see `igraph::cluster_leiden()`. -#' @param knn_mat_method (function) if mat represents a regular non-knn object, this function will attempt to convert it a knn object. +#' @param knn_obj_method (function) if mat represents a regular non-knn object, this function will attempt to convert it a knn object. #' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn object. #' @param knn_graph_method (function) if mat represents a knn object, this function will attempt to convert it to a graph matrix. #' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. 
@@ -234,13 +234,13 @@ knn_to_geodesic_graph <- function(knn, knn_mat_method = knn_hnsw, return_type = #' @export cluster_graph_leiden <- function( mat, resolution = 1, objective_function = c("modularity", "CPM"), - knn_mat_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, seed = 12531, threads = 0L, verbose = FALSE, ... + knn_obj_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, seed = 12531, threads = 0L, verbose = FALSE, ... ) { assert_has_package("igraph") # Set seed without permanently changing seed state if (rlang::is_missing(mat)) return(create_partial()) mat <- convert_mat_to_cluster_matrix( - mat, required_mat_type = "adjacency", knn_mat_method = knn_mat_method, + mat, required_mat_type = "adjacency", knn_obj_method = knn_obj_method, knn_graph_method = knn_graph_method, threads = threads, verbose = verbose ) prev_seed <- get_seed() @@ -260,14 +260,14 @@ cluster_graph_leiden <- function( #' @details **cluster_graph_louvain**: Louvain graph clustering algorithm `igraph::cluster_louvain()` #' @export cluster_graph_louvain <- function( - mat, resolution = 1, knn_mat_method = knn_hnsw, + mat, resolution = 1, knn_obj_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, seed = 12531, threads = 0L, verbose = FALSE ) { assert_has_package("igraph") # Set seed without permanently changing seed state if (rlang::is_missing(mat)) return(create_partial()) mat <- convert_mat_to_cluster_matrix( - mat, required_mat_type = "adjacency", knn_mat_method = knn_mat_method, + mat, required_mat_type = "adjacency", knn_obj_method = knn_obj_method, knn_graph_method = knn_graph_method, threads = threads, verbose = verbose ) @@ -285,13 +285,13 @@ cluster_graph_louvain <- function( #' @details **cluster_graph_seurat**: Seurat's clustering algorithm `Seurat::FindClusters()` #' @export cluster_graph_seurat <- function( - mat, resolution = 0.8, knn_mat_method = knn_hnsw, + mat, resolution = 0.8, knn_obj_method = knn_hnsw, knn_graph_method = 
knn_to_geodesic_graph, threads = 0L, verbose = FALSE, ... ) { assert_has_package("Seurat") if (rlang::is_missing(mat)) return(create_partial()) mat <- convert_mat_to_cluster_matrix( - mat, required_mat_type = "adjacency", knn_mat_method = knn_mat_method, + mat, required_mat_type = "adjacency", knn_obj_method = knn_obj_method, knn_graph_method = knn_graph_method, threads = threads, verbose = verbose ) Seurat::as.Graph(mat) %>% diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 62870486..b753f370 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -356,21 +356,16 @@ project.LSI <- function(x, mat, threads = 1L, ...) { assert_true(all(x$feature_names %in% rownames(mat))) mat <- mat[x$feature_names, ] } - mat <- partial_apply( - normalize_tfidf, + mat <- normalize_tfidf( + mat, feature_means = fitted_params$feature_means, scale_factor = fitted_params$scale_factor, - threads = threads, - .missing_args_error = FALSE - )(mat) - mat <- write_matrix_dir( - convert_matrix_type(mat, type = "float"), - tempfile("mat"), compress = TRUE + threads = threads ) feature_loadings <- fitted_params$feature_loadings res <- t(mat) %*% feature_loadings if (length(fitted_params$pcs_to_keep) != ncol(res)) { - res <- res[, fitted_params$pcs_to_keep] %>% as.matrix() + res <- res[, fitted_params$pcs_to_keep, drop = FALSE] } return(res) } @@ -389,7 +384,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @param cluster_method (function) Method to use for clustering the post-SVD matrix. #' Current builtin options are `cluster_graph_{leiden, louvain, seurat}()`. #' The user can pass in partial parameters to the cluster method, such as by passing -#' `cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))`. +#' `cluster_graph_leiden(resolution = 0.5, knn_obj_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))`. 
#' @return #' `IterativeLSI()` An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data as a matrix of shape `(cells, n_dimensions)` @@ -428,8 +423,9 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' leiden, which should provide the same clustering results while being faster. #' - ArchR also plots a umap of every iteration's dimensionality reduction. While this is not implemented in `IterativeLSI()`, #' one can use the `project()` method with the `iteration` argument set to the desired iteration to get projected data. This can then be fed into `uwot::umap()` -#' -#' @seealso `LSI()` `DimReduction()` `svd()` `knn_hnsw()` `knn_annoy()` +#' - ArchR by default filters out PCs with a correlation to sequencing depth greater than 0.75. +#' While corr_cutoff is provided as an argument in `IterativeLSI()`, it is set to not removing any PCs by default. +#' @seealso `LSI()` `DimReduction()` `svds()` `knn_hnsw()` `knn_annoy()` #' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` #' `select_features_mean()` `select_features_binned_dispersion()` #' @inheritParams LSI @@ -488,6 +484,7 @@ IterativeLSI <- function( lsi_res_obj <- LSI( mat = mat[mat_indices, ], n_dimensions = n_dimensions, + corr_cutoff = corr_cutoff, scale_factor = scale_factor, threads = threads, verbose = verbose @@ -557,15 +554,11 @@ project.IterativeLSI <- function(x, mat, iteration = x$fitted_params$iterations, scale_factor = lsi_attr$scale_factor, threads = threads ) - mat <- write_matrix_dir( - convert_matrix_type(mat, type = "float"), - tempfile("mat"), compress = TRUE - ) feature_loadings <- iter_info$feature_loadings[[1]] res <- t(mat) %*% feature_loadings if (length(iter_info$pcs_to_keep[[1]]) != ncol(res)) { - res <- res[, iter_info$pcs_to_keep[[1]]] + res <- res[, iter_info$pcs_to_keep[[1]], drop = FALSE] } return(res) } diff --git 
a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 06fa66a3..13f3ef31 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -37,7 +37,7 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} \item{cluster_method}{(function) Method to use for clustering the post-SVD matrix. Current builtin options are \verb{cluster_graph_\{leiden, louvain, seurat\}()}. The user can pass in partial parameters to the cluster method, such as by passing -\code{cluster_graph_leiden(resolution = 0.5, knn_mat_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))}.} +\code{cluster_graph_leiden(resolution = 0.5, knn_obj_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))}.} \item{threads}{(integer) Number of threads to use.} @@ -104,10 +104,12 @@ which BPCells does not encounter even with a non-subsetted matrix. Therefore, It leiden, which should provide the same clustering results while being faster. \item ArchR also plots a umap of every iteration's dimensionality reduction. While this is not implemented in \code{IterativeLSI()}, one can use the \code{project()} method with the \code{iteration} argument set to the desired iteration to get projected data. This can then be fed into \code{uwot::umap()} +\item ArchR by default filters out PCs with a correlation to sequencing depth greater than 0.75. +While corr_cutoff is provided as an argument in \code{IterativeLSI()}, it is set to not removing any PCs by default. 
} } \seealso{ -\code{LSI()} \code{DimReduction()} \code{svd()} \code{knn_hnsw()} \code{knn_annoy()} +\code{LSI()} \code{DimReduction()} \code{svds()} \code{knn_hnsw()} \code{knn_annoy()} \code{cluster_graph_leiden()} \code{cluster_graph_louvain()} \code{cluster_graph_seurat()} \code{select_features_variance()} \code{select_features_dispersion()} \code{select_features_mean()} \code{select_features_binned_dispersion()} } diff --git a/r/man/knn_graph.Rd b/r/man/knn_graph.Rd index bd5c031f..32a3b689 100644 --- a/r/man/knn_graph.Rd +++ b/r/man/knn_graph.Rd @@ -9,7 +9,7 @@ knn_to_graph( knn, use_weights = FALSE, - knn_mat_method = knn_hnsw, + knn_obj_method = knn_hnsw, self_loops = TRUE, threads = 0L, verbose = FALSE @@ -19,7 +19,7 @@ knn_to_snn_graph( knn, min_val = 1/15, self_loops = FALSE, - knn_mat_method = knn_hnsw, + knn_obj_method = knn_hnsw, return_type = c("matrix", "list"), threads = 0L, verbose = FALSE @@ -27,7 +27,7 @@ knn_to_snn_graph( knn_to_geodesic_graph( knn, - knn_mat_method = knn_hnsw, + knn_obj_method = knn_hnsw, return_type = c("matrix", "list"), threads = 0L, verbose = FALSE @@ -39,7 +39,7 @@ dist for cell x K neighbor distances} \item{use_weights}{boolean for whether to replace all distance weights with 1} -\item{knn_mat_method}{(function) if knn is not a knn object, this function will attempt to convert it to one. +\item{knn_obj_method}{(function) if knn is not a knn object, this function will attempt to convert it to one. Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. 
Ignored if knn is already a knn object.} \item{self_loops}{Whether to allow self-loops in the output graph} diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index dc46e392..3ec08e93 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -236,7 +236,7 @@ test_that("LSI works", { expect_equal(nrow(lsi_res_t), nrow(mat)) expect_equal(nrow(lsi_res_proj), ncol(mat)) expect_lt(ncol(lsi_res_obj_corr$cell_embeddings), n_dimensions) - expect_equal(lsi_res, lsi_res_proj) + expect_equal(lsi_res, lsi_res_proj, tolerance = 1e-7) }) test_that("Iterative LSI works", { @@ -244,7 +244,7 @@ test_that("Iterative LSI works", { mat <- matrix(data = runif(50000, 0, 1), nrow=500, ncol = 100) %>% as("dgCMatrix") %>% as("IterableMatrix") rownames(mat) <- paste0("feat", seq_len(nrow(mat))) colnames(mat) <- paste0("cell", seq_len(ncol(mat))) - lsi_res_obj <- expect_no_error(IterativeLSI(mat, n_iterations = 2, n_dimensions = 10L, cluster_method = cluster_graph_louvain(knn_mat_method = knn_hnsw))) + lsi_res_obj <- expect_no_error(IterativeLSI(mat, n_iterations = 2, n_dimensions = 10L, cluster_method = cluster_graph_louvain(knn_obj_method = knn_hnsw))) lsi_res_proj <- project(lsi_res_obj, mat) lsi_res_proj_iter_1 <- expect_no_error(project(lsi_res_obj, mat, iteration = 1L)) lsi_res_embedding <- lsi_res_obj$cell_embeddings @@ -252,7 +252,7 @@ test_that("Iterative LSI works", { expect_equal(ncol(lsi_res_embedding), 10) expect_equal(nrow(lsi_res_proj_iter_1), ncol(mat)) expect_equal(ncol(lsi_res_proj_iter_1), 10) - expect_equal(lsi_res_embedding, lsi_res_proj) + expect_equal(lsi_res_embedding, lsi_res_proj, tolerance = 1e-7) }) test_that("Iterative LSI works with parameterized clustering", { @@ -264,7 +264,7 @@ test_that("Iterative LSI works with parameterized clustering", { IterativeLSI( mat, n_dimensions = 10L,, cluster_method = cluster_graph_leiden( - knn_mat_method = knn_annoy(k = 12), + 
knn_obj_method = knn_annoy(k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1) ) ) @@ -273,5 +273,5 @@ test_that("Iterative LSI works with parameterized clustering", { lsi_res_embedding <- lsi_res_obj$cell_embeddings expect_equal(nrow(lsi_res_embedding), ncol(mat)) expect_equal(ncol(lsi_res_embedding), 10) - expect_equal(lsi_res_embedding, lsi_res_proj) + expect_equal(lsi_res_embedding, lsi_res_proj, tolerance = 1e-7) }) \ No newline at end of file From 26597c8969f560ff0d611462e59b73ae9641b43b Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 25 Feb 2025 13:09:56 -0800 Subject: [PATCH 113/142] [r] redo normalize examples for clarity --- r/R/transforms.R | 9 +++++++-- r/man/cluster.Rd | 8 ++++---- r/man/normalize.Rd | 9 +++++++-- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index 43342e98..65fbdaac 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -967,10 +967,15 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' set.seed(12345) #' mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) #' mat +#' #' mat <- as(mat, "IterableMatrix") #' normalize_log(mat) -#' # normalize functions can also be called with partial arguments -#' normalize_log(scale_factor = 1e5)(mat) +#' +#' # normalization functions can also be called with partial arguments +#' partial_log <- normalize_log(scale_factor = 1e5) +#' partial_log +#' +#' partial_log(mat) #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_greater_than_zero(scale_factor) diff --git a/r/man/cluster.Rd b/r/man/cluster.Rd index beea9c6c..793df08f 100644 --- a/r/man/cluster.Rd +++ b/r/man/cluster.Rd @@ -10,7 +10,7 @@ cluster_graph_leiden( mat, resolution = 1, objective_function = c("modularity", "CPM"), - knn_mat_method = knn_hnsw, + knn_obj_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, seed = 12531, threads = 0L, @@ -21,7 +21,7 @@ cluster_graph_leiden( cluster_graph_louvain( mat, 
resolution = 1, - knn_mat_method = knn_hnsw, + knn_obj_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, seed = 12531, threads = 0L, @@ -31,7 +31,7 @@ cluster_graph_louvain( cluster_graph_seurat( mat, resolution = 0.8, - knn_mat_method = knn_hnsw, + knn_obj_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, threads = 0L, verbose = FALSE, @@ -46,7 +46,7 @@ cluster_graph_seurat( \item{objective_function}{Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). For the meaning of each option, see \code{igraph::cluster_leiden()}.} -\item{knn_mat_method}{(function) if mat represents a regular non-knn object, this function will attempt to convert it a knn object. +\item{knn_obj_method}{(function) if mat represents a regular non-knn object, this function will attempt to convert it a knn object. Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn object.} \item{knn_graph_method}{(function) if mat represents a knn object, this function will attempt to convert it to a graph matrix. 
diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd index 5ca3f814..db33c58d 100644 --- a/r/man/normalize.Rd +++ b/r/man/normalize.Rd @@ -53,9 +53,14 @@ This can be used to customize \code{normalize} parameters in other single cell f set.seed(12345) mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) mat + mat <- as(mat, "IterableMatrix") normalize_log(mat) -# normalize functions can also be called with partial arguments -normalize_log(scale_factor = 1e5)(mat) + +# normalization functions can also be called with partial arguments +partial_log <- normalize_log(scale_factor = 1e5) +partial_log + +partial_log(mat) normalize_tfidf(mat) } From 6437793de284dd24ca534d639d74f0e2bef085b3 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 26 Feb 2025 16:47:30 -0800 Subject: [PATCH 114/142] [r] add inn missing rd for `convert_mat_to_cluster_matrix()` --- r/man/convert_mat_to_cluster_matrix.Rd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/man/convert_mat_to_cluster_matrix.Rd b/r/man/convert_mat_to_cluster_matrix.Rd index c00d7bdd..2f18af38 100644 --- a/r/man/convert_mat_to_cluster_matrix.Rd +++ b/r/man/convert_mat_to_cluster_matrix.Rd @@ -7,7 +7,7 @@ convert_mat_to_cluster_matrix( mat, required_mat_type = c("knn", "adjacency"), - knn_mat_method = knn_hnsw, + knn_obj_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, threads = 1L, verbose = FALSE @@ -18,7 +18,7 @@ convert_mat_to_cluster_matrix( \item{required_mat_type}{(character) Required matrix type: "adjacency" or "knn".} -\item{knn_mat_method}{(function) Function to convert input to a knn object (e.g., \code{knn_hnsw}, \code{knn_annoy}). +\item{knn_obj_method}{(function) Function to convert input to a knn object (e.g., \code{knn_hnsw}, \code{knn_annoy}). 
Ignored if \code{mat} is already a knn object or graph matrix.} \item{knn_graph_method}{(function) Function to convert a knn object to a graph adjacency matrix @@ -27,8 +27,8 @@ Ignored if \code{mat} is already a knn object or graph matrix.} #' @details This function checks the type of the input matrix \code{mat}. \code{mat} is returned without modification if it is already the required type (adjacency or knn). -If \code{mat} is a standard matrix and \code{required_mat_type} is "knn", then \code{knn_mat_method} is used to convert \code{mat} to a knn object. -If \code{required_mat_type} is "adjacency", then \code{knn_mat_method} is used to first convert \code{mat} to a knn object, +If \code{mat} is a standard matrix and \code{required_mat_type} is "knn", then \code{knn_obj_method} is used to convert \code{mat} to a knn object. +If \code{required_mat_type} is "adjacency", then \code{knn_obj_method} is used to first convert \code{mat} to a knn object, then \code{knn_graph_method} is used to convert the knn object to a graph adjacency matrix.} } \value{ From 1030f77b08f4c4198ba201f795185c859e964277 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 27 Feb 2025 15:33:13 -0800 Subject: [PATCH 115/142] [r] add more comparisons between `IterativeLSI()` and ArchR's implementation --- r/R/singlecell_utils.R | 7 ++++++- r/man/IterativeLSI.Rd | 5 +++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index b753f370..40fcfe0d 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -415,7 +415,10 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' There are some minor differences when compared to the ArchR implementation: #' - ArchR uses a different method for selecting features in the first iteration. The default method is `select_features_variance`, which is the same as the ArchR implementation. 
#' `select_features_mean(normalize_method = binarize)` can be passed in for the `feature_selection_method` argument to mimic the ArchR implementation, if choosing to only run one iteration. -#' - `IterativeLSI()` currently does not support utilization of different feature selection methods across each iteration. +#' - `IterativeLSI()` currently does not support utilization of different feature selection methods across each iteration. +#' - ArchR uses a default of 25000 features picked during feature selection. As the number of input features is dependent on the input matrix fed into `IterativeLSI()`, +#' the default for `select_features_variance()` instead picks the number of variable features as a proportion of the total features provided. To mimic the ArchR implementation, +#' `feature_selection_method` can be set to `select_features_variance(num_feats = 25000)`. #' If one desires to use a different feature selection method for each iteration, they can take the cluster assignments from the previous iteration and use them to select features and run LSI. #' - ArchR calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, #' which BPCells does not encounter even with a non-subsetted matrix. Therefore, IterativeLSI will run LSI on the entire matrix for each iteration. @@ -425,6 +428,8 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' one can use the `project()` method with the `iteration` argument set to the desired iteration to get projected data. This can then be fed into `uwot::umap()` #' - ArchR by default filters out PCs with a correlation to sequencing depth greater than 0.75. #' While corr_cutoff is provided as an argument in `IterativeLSI()`, it is set to not removing any PCs by default. +#' - ArchR by default filters out outliers dependent on number of accesible regions of cells, by the bottom and top quantiles. 
This is not implemented in `IterativeLSI()`, +#' but can be done as a preprocessing step. #' @seealso `LSI()` `DimReduction()` `svds()` `knn_hnsw()` `knn_annoy()` #' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` #' `select_features_mean()` `select_features_binned_dispersion()` diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 13f3ef31..7ad9e2df 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -97,6 +97,9 @@ There are some minor differences when compared to the ArchR implementation: \item ArchR uses a different method for selecting features in the first iteration. The default method is \code{select_features_variance}, which is the same as the ArchR implementation. \code{select_features_mean(normalize_method = binarize)} can be passed in for the \code{feature_selection_method} argument to mimic the ArchR implementation, if choosing to only run one iteration. \item \code{IterativeLSI()} currently does not support utilization of different feature selection methods across each iteration. +\item ArchR uses a default of 25000 features picked during feature selection. As the number of input features is dependent on the input matrix fed into \code{IterativeLSI()}, +the default for \code{select_features_variance()} instead picks the number of variable features as a proportion of the total features provided. To mimic the ArchR implementation, +\code{feature_selection_method} can be set to \code{select_features_variance(num_feats = 25000)}. If one desires to use a different feature selection method for each iteration, they can take the cluster assignments from the previous iteration and use them to select features and run LSI. \item ArchR calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, which BPCells does not encounter even with a non-subsetted matrix. 
Therefore, IterativeLSI will run LSI on the entire matrix for each iteration. @@ -106,6 +109,8 @@ leiden, which should provide the same clustering results while being faster. one can use the \code{project()} method with the \code{iteration} argument set to the desired iteration to get projected data. This can then be fed into \code{uwot::umap()} \item ArchR by default filters out PCs with a correlation to sequencing depth greater than 0.75. While corr_cutoff is provided as an argument in \code{IterativeLSI()}, it is set to not removing any PCs by default. +\item ArchR by default filters out outliers dependent on number of accessible regions of cells, by the bottom and top quantiles. This is not implemented in \code{IterativeLSI()}, +but can be done as a preprocessing step. } } \seealso{ From 019de7f87c0497b5d3b597bb2f319e27b2edcf55 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 27 Feb 2025 17:43:33 -0800 Subject: [PATCH 116/142] [r] fix miscellaneous syntax/reference inconsistencies for clustering --- r/R/clustering.R | 10 +++++----- r/man/cluster.Rd | 6 +++--- r/man/convert_mat_to_cluster_matrix.Rd | 15 ++++++++------- r/man/knn_graph.Rd | 2 +- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/r/R/clustering.R b/r/R/clustering.R index edb01c7b..2586d0a7 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -41,7 +41,7 @@ is_adjacency_matrix <- function(mat) { #' @param knn_graph_method (function) Function to convert a knn object to a graph adjacency matrix #' (e.g., `knn_to_geodesic_graph`). Ignored if `required_mat_type` is "knn" or if #' `mat` is already a graph matrix. -#' #' @details +#' @details #' This function checks the type of the input matrix `mat`. `mat` is returned without modification if #' it is already the required type (adjacency or knn). #' If `mat` is a standard matrix and `required_mat_type` is "knn", then `knn_obj_method` is used to convert `mat` to a knn object. 
@@ -87,7 +87,7 @@ convert_mat_to_cluster_matrix <- function( #' @param use_weights boolean for whether to replace all distance weights with 1 #' @param self_loops boolean for whether to allow cells to count themselves as neighbors #' @param knn_obj_method (function) if knn is not a knn object, this function will attempt to convert it to one. -#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn object. +#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if `knn`` is already a knn object. #' @param threads (integer) Number of threads to use. #' @param verbose (logical) Whether to print progress information during search #' @return **knn_to_graph** @@ -222,13 +222,13 @@ knn_to_geodesic_graph <- function(knn, knn_obj_method = knn_hnsw, return_type = #' @param objective_function Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). #' For the meaning of each option, see `igraph::cluster_leiden()`. #' @param knn_obj_method (function) if mat represents a regular non-knn object, this function will attempt to convert it a knn object. -#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if knn is already a knn object. +#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if `mat` is already a knn object. #' @param knn_graph_method (function) if mat represents a knn object, this function will attempt to convert it to a graph matrix. #' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. -#' Ignored if mat is already a graph adjacency matrix. +#' Ignored if `mat` is already a graph adjacency matrix. #' @param seed Random seed for clustering initialization #' @param threads (integer) Number of threads to use. 
-#' @param verbose (logical) Whether to print progress information during search +#' @param verbose (logical) Whether to print progress information. #' @param ... Additional arguments to underlying clustering function #' @return Factor vector containing the cluster assignment for each cell. #' @export diff --git a/r/man/cluster.Rd b/r/man/cluster.Rd index 793df08f..9a7a2f65 100644 --- a/r/man/cluster.Rd +++ b/r/man/cluster.Rd @@ -47,17 +47,17 @@ cluster_graph_seurat( For the meaning of each option, see \code{igraph::cluster_leiden()}.} \item{knn_obj_method}{(function) if mat represents a regular non-knn object, this function will attempt to convert it a knn object. -Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn object.} +Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if \code{mat} is already a knn object.} \item{knn_graph_method}{(function) if mat represents a knn object, this function will attempt to convert it to a graph matrix. Must be a (optionally partialized) version of \code{knn_to_graph()}, \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. 
-Ignored if mat is already a graph adjacency matrix.} +Ignored if \code{mat} is already a graph adjacency matrix.} \item{seed}{Random seed for clustering initialization} \item{threads}{(integer) Number of threads to use.} -\item{verbose}{(logical) Whether to print progress information during search} +\item{verbose}{(logical) Whether to print progress information.} \item{...}{Additional arguments to underlying clustering function} } diff --git a/r/man/convert_mat_to_cluster_matrix.Rd b/r/man/convert_mat_to_cluster_matrix.Rd index 2f18af38..57d263ba 100644 --- a/r/man/convert_mat_to_cluster_matrix.Rd +++ b/r/man/convert_mat_to_cluster_matrix.Rd @@ -23,13 +23,7 @@ Ignored if \code{mat} is already a knn object or graph matrix.} \item{knn_graph_method}{(function) Function to convert a knn object to a graph adjacency matrix (e.g., \code{knn_to_geodesic_graph}). Ignored if \code{required_mat_type} is "knn" or if -\code{mat} is already a graph matrix. -#' @details -This function checks the type of the input matrix \code{mat}. \code{mat} is returned without modification if -it is already the required type (adjacency or knn). -If \code{mat} is a standard matrix and \code{required_mat_type} is "knn", then \code{knn_obj_method} is used to convert \code{mat} to a knn object. -If \code{required_mat_type} is "adjacency", then \code{knn_obj_method} is used to first convert \code{mat} to a knn object, -then \code{knn_graph_method} is used to convert the knn object to a graph adjacency matrix.} +\code{mat} is already a graph matrix.} } \value{ The converted matrix. @@ -39,4 +33,11 @@ Ensures that the input matrix is converted to the correct type (knn or adjacency required by a clustering function. If the input is already of the correct type, it is returned as is. } +\details{ +This function checks the type of the input matrix \code{mat}. \code{mat} is returned without modification if +it is already the required type (adjacency or knn). 
+If \code{mat} is a standard matrix and \code{required_mat_type} is "knn", then \code{knn_obj_method} is used to convert \code{mat} to a knn object. +If \code{required_mat_type} is "adjacency", then \code{knn_obj_method} is used to first convert \code{mat} to a knn object, +then \code{knn_graph_method} is used to convert the knn object to a graph adjacency matrix. +} \keyword{internal} diff --git a/r/man/knn_graph.Rd b/r/man/knn_graph.Rd index 32a3b689..aef9d4c9 100644 --- a/r/man/knn_graph.Rd +++ b/r/man/knn_graph.Rd @@ -40,7 +40,7 @@ dist for cell x K neighbor distances} \item{use_weights}{boolean for whether to replace all distance weights with 1} \item{knn_obj_method}{(function) if knn is not a knn object, this function will attempt to convert it to one. -Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if knn is already a knn object.} +Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if `knn`` is already a knn object.} \item{self_loops}{Whether to allow self-loops in the output graph} From 4cbf14721d1a6f72fa317ee239403213941da01a Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 27 Feb 2025 17:54:05 -0800 Subject: [PATCH 117/142] [r] fix `knn_obj_method` docs inconsistency, fix typo for `ef` param in `knn_hnsw()` --- r/R/clustering.R | 4 ++-- r/man/knn.Rd | 2 +- r/man/knn_graph.Rd | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/r/R/clustering.R b/r/R/clustering.R index 2586d0a7..b0ec96ed 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -87,7 +87,7 @@ convert_mat_to_cluster_matrix <- function( #' @param use_weights boolean for whether to replace all distance weights with 1 #' @param self_loops boolean for whether to allow cells to count themselves as neighbors #' @param knn_obj_method (function) if knn is not a knn object, this function will attempt to convert it to one. 
-#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if `knn`` is already a knn object. +#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if `knn` is already a knn object. #' @param threads (integer) Number of threads to use. #' @param verbose (logical) Whether to print progress information during search #' @return **knn_to_graph** @@ -344,7 +344,7 @@ cluster_membership_matrix <- function(groups, group_order = NULL) { #' @param metric distance metric to use #' @param threads Number of threads to use. Note that result is non-deterministic #' if threads > 1 -#' @param ef ef parameter for RccppHNSW::hnsw_search. Increase for slower search but +#' @param ef ef parameter for `RcppHNSW::hnsw_search()`. Increase for slower search but #' improved accuracy #' @param verbose whether to print progress information during search #' @return List of 2 matrices -- idx for cell x K neighbor indices, diff --git a/r/man/knn.Rd b/r/man/knn.Rd index b92cc250..7944fc7a 100644 --- a/r/man/knn.Rd +++ b/r/man/knn.Rd @@ -38,7 +38,7 @@ knn_annoy( \item{threads}{Number of threads to use. Note that result is non-deterministic if threads > 1} -\item{ef}{ef parameter for RccppHNSW::hnsw_search. Increase for slower search but +\item{ef}{ef parameter for \code{RcppHNSW::hnsw_search()}. Increase for slower search but improved accuracy} \item{n_trees}{Number of trees during index build time. More trees gives higher accuracy} diff --git a/r/man/knn_graph.Rd b/r/man/knn_graph.Rd index aef9d4c9..e4fab410 100644 --- a/r/man/knn_graph.Rd +++ b/r/man/knn_graph.Rd @@ -40,7 +40,7 @@ dist for cell x K neighbor distances} \item{use_weights}{boolean for whether to replace all distance weights with 1} \item{knn_obj_method}{(function) if knn is not a knn object, this function will attempt to convert it to one. -Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. 
Ignored if `knn`` is already a knn object.} +Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if \code{knn} is already a knn object.} \item{self_loops}{Whether to allow self-loops in the output graph} From a2ee2c639881f1d201f4f0892c2fdc13d715d43c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 27 Feb 2025 18:04:18 -0800 Subject: [PATCH 118/142] [r] update `knn_obj_method` documentation --- r/R/clustering.R | 4 ++-- r/man/cluster.Rd | 2 +- r/man/convert_mat_to_cluster_matrix.Rd | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/r/R/clustering.R b/r/R/clustering.R index b0ec96ed..8fc8d8be 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -37,7 +37,7 @@ is_adjacency_matrix <- function(mat) { #' @param mat Input matrix to be converted. #' @param required_mat_type (character) Required matrix type: "adjacency" or "knn". #' @param knn_obj_method (function) Function to convert input to a knn object (e.g., `knn_hnsw`, `knn_annoy`). -#' Ignored if `mat` is already a knn object or graph matrix. +#' Ignored if `mat` is already a knn object or graph matrix, or if `mat` is a graph adjacency matrix. #' @param knn_graph_method (function) Function to convert a knn object to a graph adjacency matrix #' (e.g., `knn_to_geodesic_graph`). Ignored if `required_mat_type` is "knn" or if #' `mat` is already a graph matrix. @@ -222,7 +222,7 @@ knn_to_geodesic_graph <- function(knn, knn_obj_method = knn_hnsw, return_type = #' @param objective_function Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). #' For the meaning of each option, see `igraph::cluster_leiden()`. #' @param knn_obj_method (function) if mat represents a regular non-knn object, this function will attempt to convert it a knn object. -#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if `mat` is already a knn object. 
+#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if `mat` is already a knn object, or if `mat` is a graph adjacency matrix. #' @param knn_graph_method (function) if mat represents a knn object, this function will attempt to convert it to a graph matrix. #' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. #' Ignored if `mat` is already a graph adjacency matrix. diff --git a/r/man/cluster.Rd b/r/man/cluster.Rd index 9a7a2f65..e388116b 100644 --- a/r/man/cluster.Rd +++ b/r/man/cluster.Rd @@ -47,7 +47,7 @@ cluster_graph_seurat( For the meaning of each option, see \code{igraph::cluster_leiden()}.} \item{knn_obj_method}{(function) if mat represents a regular non-knn object, this function will attempt to convert it a knn object. -Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if \code{mat} is already a knn object.} +Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if \code{mat} is already a knn object, or if \code{mat} is a graph adjacency matrix.} \item{knn_graph_method}{(function) if mat represents a knn object, this function will attempt to convert it to a graph matrix. Must be a (optionally partialized) version of \code{knn_to_graph()}, \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. diff --git a/r/man/convert_mat_to_cluster_matrix.Rd b/r/man/convert_mat_to_cluster_matrix.Rd index 57d263ba..8155a773 100644 --- a/r/man/convert_mat_to_cluster_matrix.Rd +++ b/r/man/convert_mat_to_cluster_matrix.Rd @@ -19,7 +19,7 @@ convert_mat_to_cluster_matrix( \item{required_mat_type}{(character) Required matrix type: "adjacency" or "knn".} \item{knn_obj_method}{(function) Function to convert input to a knn object (e.g., \code{knn_hnsw}, \code{knn_annoy}). 
-Ignored if \code{mat} is already a knn object or graph matrix.} +Ignored if \code{mat} is already a knn object or graph matrix, or if \code{mat} is a graph adjacency matrix.} \item{knn_graph_method}{(function) Function to convert a knn object to a graph adjacency matrix (e.g., \code{knn_to_geodesic_graph}). Ignored if \code{required_mat_type} is "knn" or if From 49c497da848256ceb3c4031a9d257df5c18b12d6 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 27 Feb 2025 18:28:04 -0800 Subject: [PATCH 119/142] [r] fix math wording in `feature_selection` --- r/R/singlecell_utils.R | 4 ++-- r/man/feature_selection.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 40fcfe0d..a12973ff 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -31,8 +31,8 @@ #' - `highly_variable`: Logical vector of whether the feature is highly variable. #' #' Each different feature selection method will have a different scoring method. -#' For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, determine the score of -#' each feature \eqn{x_i} as follows: +#' Consider a matrix \eqn{X}, where the row index \eqn{i} refers to each feature +#' and the column index \eqn{j} refers to each cell. For each feature \eqn{x_{i} \in X}, we define the following feature-selection scores: #' - `select_features_variance`: \eqn{\mathrm{Score}(x_i) = \frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2} #' @examples #' set.seed(12345) diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index ccc4c50b..d80fefbd 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -57,8 +57,8 @@ Return a dataframe with the following columns: } Each different feature selection method will have a different scoring method. 
-For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, determine the score of -each feature \eqn{x_i} as follows: +Consider a matrix \eqn{X}, where the row index \eqn{i} refers to each feature +and the column index \eqn{j} refers to each cell. For each feature \eqn{x_{i} \in X}, we define the following feature-selection scores: \itemize{ \item \code{select_features_variance}: \eqn{\mathrm{Score}(x_i) = \frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2} } From cf108a83c6d18e64fb9f939b75f88fab8f0120f3 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 28 Feb 2025 15:05:22 -0800 Subject: [PATCH 120/142] [r] clean up normalizations code, docs --- r/R/transforms.R | 20 +++++++++++--------- r/man/normalize.Rd | 7 ++++--- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index 65fbdaac..347ad926 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -951,8 +951,10 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' @param mat (IterableMatrix) Counts matrix with dimensions `(features x cells)`. #' @param scale_factor (numeric) Scaling factor to multiply matrix by prior to normalization (see formulas below). #' @param threads (integer) Number of threads to use. -#' @returns For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, -#' transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: +#' @returns +#' Consider a matrix \eqn{X}, where the row index \eqn{i} refers to each feature +#' and the column index \eqn{j} refers to each cell. 
For each element \eqn{{x}_{ij} \in X}, the +#' normalized value \eqn{\tilde{x}_{ij}} is calculated as: #' #' - `normalize_log`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} #' @details @@ -982,14 +984,14 @@ normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_is_wholenumber(threads) if (rlang::is_missing(mat)) return(create_partial()) assert_is_mat(mat) - read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) + read_depth <- matrix_stats(mat, col_stats = "mean", threads = threads)$col_stats["mean", ] * nrow(mat) mat <- mat %>% multiply_cols(1 / read_depth) return(log1p(mat * scale_factor)) } #' @rdname normalize -#' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by (rowMeans(mat) by default). +#' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by (`rowMeans(mat)` by default). #' If feature_means has names and mat has row names, match values by name. #' Otherwise, assume feature_means has the same length and ordering as the matrix rows. 
#' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} @@ -1007,21 +1009,21 @@ normalize_tfidf <- function( assert_is_mat(mat) # If feature means are passed in, only need to calculate term frequency if (is.null(feature_means)) { - mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + mat_stats <- matrix_stats(mat, row_stats = "mean", col_stats = "mean", threads = threads) feature_means <- mat_stats$row_stats["mean", ] read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) } else { assert_is_numeric(feature_means) - if (!is.null(names(feature_means)) && !is.null(rownames(mat))) { + if (!is.null(rownames(mat))) { # Make sure every name in feature means exists in rownames(mat) - # In the case there is a length mismatch but the feature names all exist in feature_means - # will not error out assert_true(all(rownames(mat) %in% names(feature_means))) + # subset feature_means to the rownames of mat in the case there is a length mismatch + # but the feature names all exist in feature_means, feature_means <- feature_means[rownames(mat)] } else { assert_len(feature_means, nrow(mat)) } - read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) + read_depth <- matrix_stats(mat, col_stats = "mean", threads = threads)$col_stats["mean", ] * nrow(mat) } tf <- mat %>% multiply_cols(1 / read_depth) idf <- 1 / feature_means diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd index db33c58d..d9a78a2c 100644 --- a/r/man/normalize.Rd +++ b/r/man/normalize.Rd @@ -16,13 +16,14 @@ normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) \item{threads}{(integer) Number of threads to use.} -\item{feature_means}{(numeric, optional) Pre-calculated means of the features to normalize by (rowMeans(mat) by default). 
+\item{feature_means}{(numeric, optional) Pre-calculated means of the features to normalize by (\code{rowMeans(mat)} by default). If feature_means has names and mat has row names, match values by name. Otherwise, assume feature_means has the same length and ordering as the matrix rows.} } \value{ -For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, -transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: +Consider a matrix \eqn{X}, where the row index \eqn{i} refers to each feature +and the column index \eqn{j} refers to each cell. For each element \eqn{{x}_{ij} \in X}, the +normalized value \eqn{\tilde{x}_{ij}} is calculated as: \itemize{ \item \code{normalize_log}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} } From beaef0ac1ffae8f889485fb3573bc0ef0afdf937 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 28 Feb 2025 17:03:56 -0800 Subject: [PATCH 121/142] [r] remove knn method from graph adjacency functions. rewrite `convert_mat_to_cluster_matrix()` to always convert to graph adjacency matrix. rename `convert_mat_to_cluster_matrix()` to `convert_mat_to_graph()`. 
Add input checking for knn to graph functions --- r/NEWS.md | 5 +- r/R/clustering.R | 127 +++++++++++------------ r/R/singlecell_utils.R | 2 +- r/man/IterativeLSI.Rd | 2 +- r/man/cluster.Rd | 33 +++--- r/man/convert_mat_to_cluster_matrix.Rd | 43 -------- r/man/convert_mat_to_graph.Rd | 35 +++++++ r/man/is_adjacency_matrix.Rd | 4 +- r/man/knn_graph.Rd | 31 +----- r/tests/testthat/test-singlecell_utils.R | 6 +- 10 files changed, 129 insertions(+), 159 deletions(-) delete mode 100644 r/man/convert_mat_to_cluster_matrix.Rd create mode 100644 r/man/convert_mat_to_graph.Rd diff --git a/r/NEWS.md b/r/NEWS.md index f50361da..9c3a0ff5 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -10,7 +10,7 @@ Contributions welcome :) ## Breaking changes - Change first parameter name of `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()` from `snn` to `mat` to more accurately reflect the input type. (pull request #189) -- Added non-final parameters, `knn_obj_method` and `knn_graph_method` to `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()`. (pull request #189) +- Added parameters, `knn_method` and `knn_to_graph_method` to `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()`. (pull request #189) ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166) @@ -19,8 +19,7 @@ Contributions welcome :) - Add feature selection functions `select_features_variance()`, and `select_features_{dispersion,mean,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189) - Add `LSI()` and `IterativeLSI()` dimensionality functions to perform latent semantic indexing on a matrix (pull request #189). - Add capability to create partial function objects in when excluding the first argument of a function. 
This is implemented in normalizations, feature selections, dimensionality reductions, and clustering functions. See `select_features_variance()` for usage. (pull request #189) -- Allowed clustering functions `cluster_graph_leiden()`, `cluster_graph_louvain()`, and `cluster_graph_seurat()` to also perform knn object and graph adjacency construction intermediate steps with `knn_obj_method` and `knn_graph_method` parameters. Also provided -`threads` and `verbose` arguments to clustering functions that are automatically passed down to knn object/graph adjacency construction steps. (pull request #189) +- Allowed clustering functions `cluster_graph_leiden()`, `cluster_graph_louvain()`, and `cluster_graph_seurat()` to also perform knn object and graph adjacency intermediate construction steps with `knn_method` and `knn_to_graph_method` parameters. Also provided `threads` and `verbose` arguments to clustering functions that are automatically passed down to knn object/graph adjacency construction steps. Baseline expectation of usage in clustering functions is changed to having users directly input a cell embedding matrix. (pull request #189) ## Improvements - Speed up taking large subsets of large concatenated matrices, e.g. selecting 9M cells from a 10M cell matrix composed of ~100 concatenated pieces. (pull request #179) diff --git a/r/R/clustering.R b/r/R/clustering.R index 8fc8d8be..02474aa7 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -20,54 +20,47 @@ is_knn_object <- function(mat) { #' Check if an input is a graph adjacency matrix. #' -#' Clustering function like `cluster_graph_leiden()` and `cluster_graph_louvain()` require a graph adjacency matrix as input. -#' We assume that any `dgCMatrix` that is square is a graph adjacency matrix. +#' Clustering functions like `cluster_graph_leiden()` and `cluster_graph_louvain()` require a graph adjacency matrix as input. +#' We assume that any square `dgCMatrix` is a graph adjacency matrix. 
#' @return TRUE if the mat is a graph adjacency matrix, FALSE otherwise #' @keywords internal is_adjacency_matrix <- function(mat) { return(is(mat, "dgCMatrix") && nrow(mat) == ncol(mat)) } -#' Convert a matrix to the required input type for clustering +#' Converts a matrix to a graph adjacency matrix #' -#' Ensures that the input matrix is converted to the correct type (knn or adjacency) -#' required by a clustering function. If the input is already of the correct type, -#' it is returned as is. -#' -#' @param mat Input matrix to be converted. -#' @param required_mat_type (character) Required matrix type: "adjacency" or "knn". -#' @param knn_obj_method (function) Function to convert input to a knn object (e.g., `knn_hnsw`, `knn_annoy`). -#' Ignored if `mat` is already a knn object or graph matrix, or if `mat` is a graph adjacency matrix. -#' @param knn_graph_method (function) Function to convert a knn object to a graph adjacency matrix -#' (e.g., `knn_to_geodesic_graph`). Ignored if `required_mat_type` is "knn" or if -#' `mat` is already a graph matrix. +#' Handles inputs so that cell embeddings and knn objects are converted into graph adjacency matrices. +#' @param mat Input matrix to be converted of shape `(cells x n_embeddings)` +#' @param knn_method (function) Function to convert input to a knn object (e.g., `knn_hnsw`, `knn_annoy`). +#' Ignored if `mat` is already a knn object or graph matrix. +#' @param knn_to_graph_method (function) Function to convert a knn object to a graph adjacency matrix +#' (e.g., `knn_to_geodesic_graph`). Ignored if `mat` is already a graph matrix. #' @details -#' This function checks the type of the input matrix `mat`. `mat` is returned without modification if -#' it is already the required type (adjacency or knn). -#' If `mat` is a standard matrix and `required_mat_type` is "knn", then `knn_obj_method` is used to convert `mat` to a knn object. 
-#' If `required_mat_type` is "adjacency", then `knn_obj_method` is used to first convert `mat` to a knn object, -#' then `knn_graph_method` is used to convert the knn object to a graph adjacency matrix. -#' @return The converted matrix. +#' If `mat` is a `(cells x n_embeddings)` matrix then `knn_method` is used to convert `mat` to a knn object. +#' Knn objects are then converted into a graph adjacency matrix using `knn_to_graph_method`. Also does intermediate checks to ensure +#' that `knn_method` and `knn_to_graph` properly converted matrix into a correct output. +#' @return Symmetric graph adjacency matrix (dgCMatrix) #' @keywords internal -convert_mat_to_cluster_matrix <- function( +convert_mat_to_graph <- function( mat, - required_mat_type = c("knn", "adjacency"), - knn_obj_method = knn_hnsw, - knn_graph_method = knn_to_geodesic_graph, + knn_method = knn_hnsw, + knn_to_graph_method = knn_to_geodesic_graph, threads = 1L, verbose = FALSE ) { - required_mat_type <- match.arg(required_mat_type) if (is(mat, "matrix")) { - mat <- partial_apply(knn_obj_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) + mat <- partial_apply(knn_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) } - if (required_mat_type == "knn" && !is_knn_object(mat)) { + if (!(is_knn_object(mat) || is_adjacency_matrix(mat))) { pretty_error(mat, "must be a knn object, or convertible to one", 1) } - if (required_mat_type == "adjacency" && is_knn_object(mat)) { - mat <- partial_apply(knn_graph_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) + # There currently aren't any `knn_to_graph` functions that utilize a verbose argument. + # However, we still pass `verbose` in case future functions do provide this functionality. 
+ if (is_knn_object(mat)) { + mat <- partial_apply(knn_to_graph_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) } - if (required_mat_type == "adjacency" && !is_adjacency_matrix(mat)) { + if (!is_adjacency_matrix(mat)) { pretty_error(mat, "must be a graph adjacency matrix, or convertible to one", 1) } return(mat) @@ -86,20 +79,15 @@ convert_mat_to_cluster_matrix <- function( #' dist for cell x K neighbor distances #' @param use_weights boolean for whether to replace all distance weights with 1 #' @param self_loops boolean for whether to allow cells to count themselves as neighbors -#' @param knn_obj_method (function) if knn is not a knn object, this function will attempt to convert it to one. -#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if `knn` is already a knn object. -#' @param threads (integer) Number of threads to use. -#' @param verbose (logical) Whether to print progress information during search #' @return **knn_to_graph** #' Sparse matrix (dgCMatrix) where `mat[i,j]` = distance from cell `i` to #' cell `j`, or 0 if cell `j` is not in the K nearest neighbors of `i` -knn_to_graph <- function(knn, use_weights = FALSE, knn_obj_method = knn_hnsw, self_loops = TRUE, threads = 0L, verbose = FALSE) { +knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) { assert_is(use_weights, "logical") assert_is(self_loops, "logical") - assert_is_wholenumber(threads) - assert_is(verbose, "logical") + if (rlang::is_missing(knn)) return(create_partial()) - mat <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_obj_method = knn_obj_method, threads = threads, verbose = verbose) + assert_true(is_knn_object(knn)) if (use_weights) { weights <- knn$dist } else { @@ -138,13 +126,11 @@ knn_to_graph <- function(knn, use_weights = FALSE, knn_obj_method = knn_hnsw, se #' These correspond to the rows, cols, and values of non-zero entries in the lower triangle #' adjacency matrix. 
`dim` is the total number of vertices (cells) in the graph #' @export -knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, knn_obj_method = knn_hnsw, return_type=c("matrix", "list"), threads = 0L, verbose = FALSE) { +knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_type=c("matrix", "list")) { return_type <- match.arg(return_type) assert_is(self_loops, "logical") - assert_is_wholenumber(threads) - assert_is(verbose, "logical") if (rlang::is_missing(knn)) return(create_partial()) - knn <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_obj_method = knn_obj_method, threads = threads, verbose = verbose) + assert_true(is_knn_object(knn)) # Solve x / (2*K - x) >= min_val --> x >= 2*K*min_val / (1 + min_val) min_int <- ceiling(2*min_val*ncol(knn$idx) / (1 + min_val)) snn <- build_snn_graph_cpp(knn$idx, min_neighbors = min_int) @@ -173,6 +159,7 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, knn_obj_ } #' @rdname knn_graph +#' @param threads Number of threads to use during calculations #' @details **knn_to_geodesic_graph** #' Convert a knn object into an undirected weighted graph, using the same #' geodesic distance estimation method as the UMAP package. @@ -193,11 +180,11 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, knn_obj_ #' These correspond to the rows, cols, and values of non-zero entries in the lower triangle #' adjacency matrix. 
`dim` is the total number of vertices (cells) in the graph #' @export -knn_to_geodesic_graph <- function(knn, knn_obj_method = knn_hnsw, return_type = c("matrix", "list"), threads = 0L, verbose = FALSE) { +knn_to_geodesic_graph <- function(knn, return_type = c("matrix", "list"), threads = 0L) { return_type <- match.arg(return_type) assert_is_wholenumber(threads) if (rlang::is_missing(knn)) return(create_partial()) - knn <- convert_mat_to_cluster_matrix(knn, required_mat_type = "knn", knn_obj_method = knn_obj_method, threads = threads, verbose = verbose) + assert_true(is_knn_object(knn)) graph <- build_umap_graph_cpp(knn$dist, knn$idx, threads=threads) graph$dim <- nrow(knn$idx) @@ -211,37 +198,43 @@ knn_to_geodesic_graph <- function(knn, knn_obj_method = knn_hnsw, return_type = return(res) } -#' Cluster an adjacency matrix +#' Cluster a cell embedding matrix using a graph based algorithm #' @rdname cluster #' @details **cluster_graph_leiden**: Leiden clustering algorithm `igraph::cluster_leiden()`. #' Note that when using `objective_function = "CPM"` the number of clusters empirically scales with `cells * resolution`, #' so 1e-3 is a good resolution for 10k cells, but 1M cells is better with a 1e-5 resolution. A resolution of 1 is a #' good default when `objective_function = "modularity"` per the default. -#' @param mat Symmetric adjacency matrix (dgCMatrix) output from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Only the lower triangle is used +#' @param mat (matrix) `(cells x n_embeddings)` matrix from a dimensionality reduction. +#' `mat` can also be a knn object (list), with names `idx` and `dist`, returned from a knn method (See `knn_hnsw()`, `knn_annoy()`). +#' Additionally, `mat` can be a symmetric graph adjacency matrix (dgCMatrix) from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. +#' Only the lower triangle from a graph adjacency matrix is used. #' @param resolution Resolution parameter. 
Higher values result in more clusters #' @param objective_function Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). #' For the meaning of each option, see `igraph::cluster_leiden()`. -#' @param knn_obj_method (function) if mat represents a regular non-knn object, this function will attempt to convert it a knn object. -#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if `mat` is already a knn object, or if `mat` is a graph adjacency matrix. -#' @param knn_graph_method (function) if mat represents a knn object, this function will attempt to convert it to a graph matrix. +#' @param knn_method (function) Function to convert `mat` from cell embeddings to a knn object. +#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if `mat` is already a knn object/graph adjacency matrix. +#' @param knn_to_graph_method (function) Function to convert `mat` from a knn object generated from `knn_method` to a graph adjacency matrix. #' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. #' Ignored if `mat` is already a graph adjacency matrix. #' @param seed Random seed for clustering initialization -#' @param threads (integer) Number of threads to use. -#' @param verbose (logical) Whether to print progress information. +#' @param threads (integer) Number of threads to use in `knn_method` and `knn_to_graph_method`. If these functions do not utilize +#' a `threads` argument, this is silently ignored. +#' @param verbose (logical) Whether to print progress information in `knn_method` and `knn_to_graph_method`. If these functions do not utilize +#' a `verbose` argument, this is silently ignored. #' @param ... Additional arguments to underlying clustering function #' @return Factor vector containing the cluster assignment for each cell. 
#' @export cluster_graph_leiden <- function( - mat, resolution = 1, objective_function = c("modularity", "CPM"), - knn_obj_method = knn_hnsw, knn_graph_method = knn_to_geodesic_graph, seed = 12531, threads = 0L, verbose = FALSE, ... + mat, resolution = 1, objective_function = c("modularity", "CPM"), + knn_method = knn_hnsw, knn_to_graph_method = knn_to_geodesic_graph, seed = 12531, threads = 0L, verbose = FALSE, ... ) { assert_has_package("igraph") # Set seed without permanently changing seed state if (rlang::is_missing(mat)) return(create_partial()) - mat <- convert_mat_to_cluster_matrix( - mat, required_mat_type = "adjacency", knn_obj_method = knn_obj_method, - knn_graph_method = knn_graph_method, threads = threads, verbose = verbose + mat <- convert_mat_to_graph( + mat, knn_method = knn_method, + knn_to_graph_method = knn_to_graph_method, + threads = threads, verbose = verbose ) prev_seed <- get_seed() on.exit(restore_seed(prev_seed), add = TRUE) @@ -260,15 +253,16 @@ cluster_graph_leiden <- function( #' @details **cluster_graph_louvain**: Louvain graph clustering algorithm `igraph::cluster_louvain()` #' @export cluster_graph_louvain <- function( - mat, resolution = 1, knn_obj_method = knn_hnsw, - knn_graph_method = knn_to_geodesic_graph, seed = 12531, threads = 0L, verbose = FALSE + mat, resolution = 1, knn_method = knn_hnsw, + knn_to_graph_method = knn_to_geodesic_graph, seed = 12531, threads = 0L, verbose = FALSE ) { assert_has_package("igraph") # Set seed without permanently changing seed state if (rlang::is_missing(mat)) return(create_partial()) - mat <- convert_mat_to_cluster_matrix( - mat, required_mat_type = "adjacency", knn_obj_method = knn_obj_method, - knn_graph_method = knn_graph_method, threads = threads, verbose = verbose + mat <- convert_mat_to_graph( + mat, knn_method = knn_method, + knn_to_graph_method = knn_to_graph_method, + threads = threads, verbose = verbose ) prev_seed <- get_seed() @@ -285,14 +279,15 @@ cluster_graph_louvain <- function( 
#' @details **cluster_graph_seurat**: Seurat's clustering algorithm `Seurat::FindClusters()` #' @export cluster_graph_seurat <- function( - mat, resolution = 0.8, knn_obj_method = knn_hnsw, - knn_graph_method = knn_to_geodesic_graph, threads = 0L, verbose = FALSE, ... + mat, resolution = 0.8, knn_method = knn_hnsw, + knn_to_graph_method = knn_to_geodesic_graph, threads = 0L, verbose = FALSE, ... ) { assert_has_package("Seurat") if (rlang::is_missing(mat)) return(create_partial()) - mat <- convert_mat_to_cluster_matrix( - mat, required_mat_type = "adjacency", knn_obj_method = knn_obj_method, - knn_graph_method = knn_graph_method, threads = threads, verbose = verbose + mat <- convert_mat_to_graph( + mat, knn_method = knn_method, + knn_to_graph_method = knn_to_graph_method, + threads = threads, verbose = verbose ) Seurat::as.Graph(mat) %>% Seurat::FindClusters(resolution = resolution, ...) %>% diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index a12973ff..ec9dc948 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -384,7 +384,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @param cluster_method (function) Method to use for clustering the post-SVD matrix. #' Current builtin options are `cluster_graph_{leiden, louvain, seurat}()`. #' The user can pass in partial parameters to the cluster method, such as by passing -#' `cluster_graph_leiden(resolution = 0.5, knn_obj_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))`. +#' `cluster_graph_leiden(resolution = 0.5, knn_method = knn_hnsw(ef = 500, k = 12), knn_to_graph_method = knn_to_snn_graph(min_val = 0.1))`. 
#' @return #' `IterativeLSI()` An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data as a matrix of shape `(cells, n_dimensions)` diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 7ad9e2df..212140d3 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -37,7 +37,7 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} \item{cluster_method}{(function) Method to use for clustering the post-SVD matrix. Current builtin options are \verb{cluster_graph_\{leiden, louvain, seurat\}()}. The user can pass in partial parameters to the cluster method, such as by passing -\code{cluster_graph_leiden(resolution = 0.5, knn_obj_method = knn_hnsw(ef = 500, k = 12), knn_graph_method = knn_to_snn_graph(min_val = 0.1))}.} +\code{cluster_graph_leiden(resolution = 0.5, knn_method = knn_hnsw(ef = 500, k = 12), knn_to_graph_method = knn_to_snn_graph(min_val = 0.1))}.} \item{threads}{(integer) Number of threads to use.} diff --git a/r/man/cluster.Rd b/r/man/cluster.Rd index e388116b..ebf80f8a 100644 --- a/r/man/cluster.Rd +++ b/r/man/cluster.Rd @@ -4,14 +4,14 @@ \alias{cluster_graph_leiden} \alias{cluster_graph_louvain} \alias{cluster_graph_seurat} -\title{Cluster an adjacency matrix} +\title{Cluster a cell embedding matrix using a graph based algorithm} \usage{ cluster_graph_leiden( mat, resolution = 1, objective_function = c("modularity", "CPM"), - knn_obj_method = knn_hnsw, - knn_graph_method = knn_to_geodesic_graph, + knn_method = knn_hnsw, + knn_to_graph_method = knn_to_geodesic_graph, seed = 12531, threads = 0L, verbose = FALSE, @@ -21,8 +21,8 @@ cluster_graph_leiden( cluster_graph_louvain( mat, resolution = 1, - knn_obj_method = knn_hnsw, - knn_graph_method = knn_to_geodesic_graph, + knn_method = knn_hnsw, + knn_to_graph_method = knn_to_geodesic_graph, seed = 12531, threads = 0L, verbose = FALSE @@ -31,33 +31,38 @@ cluster_graph_louvain( cluster_graph_seurat( mat, 
resolution = 0.8, - knn_obj_method = knn_hnsw, - knn_graph_method = knn_to_geodesic_graph, + knn_method = knn_hnsw, + knn_to_graph_method = knn_to_geodesic_graph, threads = 0L, verbose = FALSE, ... ) } \arguments{ -\item{mat}{Symmetric adjacency matrix (dgCMatrix) output from e.g. \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Only the lower triangle is used} +\item{mat}{(matrix) \verb{(cells x n_embeddings)} matrix from a dimensionality reduction. +\code{mat} can also be a knn object (list), with names \code{idx} and \code{dist}, returned from a knn method (See \code{knn_hnsw()}, \code{knn_annoy()}). +Additionally, \code{mat} can be a symmetric graph adjacency matrix (dgCMatrix) from e.g. \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. +Only the lower triangle from a graph adjacency matrix is used.} \item{resolution}{Resolution parameter. Higher values result in more clusters} \item{objective_function}{Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). For the meaning of each option, see \code{igraph::cluster_leiden()}.} -\item{knn_obj_method}{(function) if mat represents a regular non-knn object, this function will attempt to convert it a knn object. -Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if \code{mat} is already a knn object, or if \code{mat} is a graph adjacency matrix.} +\item{knn_method}{(function) Function to convert \code{mat} from cell embeddings to a knn object. +Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if \code{mat} is already a knn object/graph adjacency matrix.} -\item{knn_graph_method}{(function) if mat represents a knn object, this function will attempt to convert it to a graph matrix. 
+\item{knn_to_graph_method}{(function) Function to convert \code{mat} from a knn object generated from \code{knn_method} to a graph adjacency matrix. Must be a (optionally partialized) version of \code{knn_to_graph()}, \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Ignored if \code{mat} is already a graph adjacency matrix.} \item{seed}{Random seed for clustering initialization} -\item{threads}{(integer) Number of threads to use.} +\item{threads}{(integer) Number of threads to use in \code{knn_method} and \code{knn_to_graph_method}. If these functions do not utilize +a \code{threads} argument, this is silently ignored.} -\item{verbose}{(logical) Whether to print progress information.} +\item{verbose}{(logical) Whether to print progress information in \code{knn_method} and \code{knn_to_graph_method}. If these functions do not utilize +a \code{verbose} argument, this is silently ignored.} \item{...}{Additional arguments to underlying clustering function} } @@ -65,7 +70,7 @@ Ignored if \code{mat} is already a graph adjacency matrix.} Factor vector containing the cluster assignment for each cell. } \description{ -Cluster an adjacency matrix +Cluster a cell embedding matrix using a graph based algorithm } \details{ \strong{cluster_graph_leiden}: Leiden clustering algorithm \code{igraph::cluster_leiden()}. 
diff --git a/r/man/convert_mat_to_cluster_matrix.Rd b/r/man/convert_mat_to_cluster_matrix.Rd deleted file mode 100644 index 8155a773..00000000 --- a/r/man/convert_mat_to_cluster_matrix.Rd +++ /dev/null @@ -1,43 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clustering.R -\name{convert_mat_to_cluster_matrix} -\alias{convert_mat_to_cluster_matrix} -\title{Convert a matrix to the required input type for clustering} -\usage{ -convert_mat_to_cluster_matrix( - mat, - required_mat_type = c("knn", "adjacency"), - knn_obj_method = knn_hnsw, - knn_graph_method = knn_to_geodesic_graph, - threads = 1L, - verbose = FALSE -) -} -\arguments{ -\item{mat}{Input matrix to be converted.} - -\item{required_mat_type}{(character) Required matrix type: "adjacency" or "knn".} - -\item{knn_obj_method}{(function) Function to convert input to a knn object (e.g., \code{knn_hnsw}, \code{knn_annoy}). -Ignored if \code{mat} is already a knn object or graph matrix, or if \code{mat} is a graph adjacency matrix.} - -\item{knn_graph_method}{(function) Function to convert a knn object to a graph adjacency matrix -(e.g., \code{knn_to_geodesic_graph}). Ignored if \code{required_mat_type} is "knn" or if -\code{mat} is already a graph matrix.} -} -\value{ -The converted matrix. -} -\description{ -Ensures that the input matrix is converted to the correct type (knn or adjacency) -required by a clustering function. If the input is already of the correct type, -it is returned as is. -} -\details{ -This function checks the type of the input matrix \code{mat}. \code{mat} is returned without modification if -it is already the required type (adjacency or knn). -If \code{mat} is a standard matrix and \code{required_mat_type} is "knn", then \code{knn_obj_method} is used to convert \code{mat} to a knn object. 
-If \code{required_mat_type} is "adjacency", then \code{knn_obj_method} is used to first convert \code{mat} to a knn object,
-then \code{knn_graph_method} is used to convert the knn object to a graph adjacency matrix.
-}
-\keyword{internal}
diff --git a/r/man/convert_mat_to_graph.Rd b/r/man/convert_mat_to_graph.Rd
new file mode 100644
index 00000000..da5cfd38
--- /dev/null
+++ b/r/man/convert_mat_to_graph.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/clustering.R
+\name{convert_mat_to_graph}
+\alias{convert_mat_to_graph}
+\title{Converts a matrix to a graph adjacency matrix}
+\usage{
+convert_mat_to_graph(
+  mat,
+  knn_method = knn_hnsw,
+  knn_to_graph_method = knn_to_geodesic_graph,
+  threads = 1L,
+  verbose = FALSE
+)
+}
+\arguments{
+\item{mat}{Input matrix to be converted of shape \verb{(cells x n_embeddings)}}
+
+\item{knn_method}{(function) Function to convert input to a knn object (e.g., \code{knn_hnsw}, \code{knn_annoy}).
+Ignored if \code{mat} is already a knn object or graph matrix.}
+
+\item{knn_to_graph_method}{(function) Function to convert a knn object to a graph adjacency matrix
+(e.g., \code{knn_to_geodesic_graph}). Ignored if \code{mat} is already a graph matrix.}
+}
+\value{
+Symmetric graph adjacency matrix (dgCMatrix)
+}
+\description{
+Handles inputs so that cell embeddings and knn objects are converted into graph adjacency matrices.
+}
+\details{
+If \code{mat} is a \verb{(cells x n_embeddings)} matrix then \code{knn_method} is used to convert \code{mat} to a knn object.
+Knn objects are then converted into a graph adjacency matrix using \code{knn_to_graph_method}. Also does intermediate checks to ensure
+that \code{knn_method} and \code{knn_to_graph_method} properly converted the matrix into a correct output.
+} +\keyword{internal} diff --git a/r/man/is_adjacency_matrix.Rd b/r/man/is_adjacency_matrix.Rd index 3b01e66a..c44ce669 100644 --- a/r/man/is_adjacency_matrix.Rd +++ b/r/man/is_adjacency_matrix.Rd @@ -10,7 +10,7 @@ is_adjacency_matrix(mat) TRUE if the mat is a graph adjacency matrix, FALSE otherwise } \description{ -Clustering function like \code{cluster_graph_leiden()} and \code{cluster_graph_louvain()} require a graph adjacency matrix as input. -We assume that any \code{dgCMatrix} that is square is a graph adjacency matrix. +Clustering functions like \code{cluster_graph_leiden()} and \code{cluster_graph_louvain()} require a graph adjacency matrix as input. +We assume that any square \code{dgCMatrix} is a graph adjacency matrix. } \keyword{internal} diff --git a/r/man/knn_graph.Rd b/r/man/knn_graph.Rd index e4fab410..5d365c41 100644 --- a/r/man/knn_graph.Rd +++ b/r/man/knn_graph.Rd @@ -6,32 +6,16 @@ \alias{knn_to_geodesic_graph} \title{K Nearest Neighbor (KNN) Graph} \usage{ -knn_to_graph( - knn, - use_weights = FALSE, - knn_obj_method = knn_hnsw, - self_loops = TRUE, - threads = 0L, - verbose = FALSE -) +knn_to_graph(knn, use_weights = FALSE, self_loops = TRUE) knn_to_snn_graph( knn, min_val = 1/15, self_loops = FALSE, - knn_obj_method = knn_hnsw, - return_type = c("matrix", "list"), - threads = 0L, - verbose = FALSE + return_type = c("matrix", "list") ) -knn_to_geodesic_graph( - knn, - knn_obj_method = knn_hnsw, - return_type = c("matrix", "list"), - threads = 0L, - verbose = FALSE -) +knn_to_geodesic_graph(knn, return_type = c("matrix", "list"), threads = 0L) } \arguments{ \item{knn}{List of 2 matrices -- idx for cell x K neighbor indices, @@ -39,19 +23,14 @@ dist for cell x K neighbor distances} \item{use_weights}{boolean for whether to replace all distance weights with 1} -\item{knn_obj_method}{(function) if knn is not a knn object, this function will attempt to convert it to one. 
-Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if \code{knn} is already a knn object.} - \item{self_loops}{Whether to allow self-loops in the output graph} -\item{threads}{(integer) Number of threads to use.} - -\item{verbose}{(logical) Whether to print progress information during search} - \item{min_val}{minimum jaccard index between neighbors. Values below this will round to 0} \item{return_type}{Whether to return a sparse adjacency matrix or an edge list} + +\item{threads}{Number of threads to use during calculations} } \value{ \strong{knn_to_graph} diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 3ec08e93..b21311cc 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -244,7 +244,7 @@ test_that("Iterative LSI works", { mat <- matrix(data = runif(50000, 0, 1), nrow=500, ncol = 100) %>% as("dgCMatrix") %>% as("IterableMatrix") rownames(mat) <- paste0("feat", seq_len(nrow(mat))) colnames(mat) <- paste0("cell", seq_len(ncol(mat))) - lsi_res_obj <- expect_no_error(IterativeLSI(mat, n_iterations = 2, n_dimensions = 10L, cluster_method = cluster_graph_louvain(knn_obj_method = knn_hnsw))) + lsi_res_obj <- expect_no_error(IterativeLSI(mat, n_iterations = 2, n_dimensions = 10L, cluster_method = cluster_graph_louvain(knn_method = knn_hnsw))) lsi_res_proj <- project(lsi_res_obj, mat) lsi_res_proj_iter_1 <- expect_no_error(project(lsi_res_obj, mat, iteration = 1L)) lsi_res_embedding <- lsi_res_obj$cell_embeddings @@ -264,8 +264,8 @@ test_that("Iterative LSI works with parameterized clustering", { IterativeLSI( mat, n_dimensions = 10L,, cluster_method = cluster_graph_leiden( - knn_obj_method = knn_annoy(k = 12), - knn_graph_method = knn_to_snn_graph(min_val = 0.1) + knn_method = knn_annoy(k = 12), + knn_to_graph_method = knn_to_snn_graph(min_val = 0.1) ) ) ) From 307b2c5371249546ebae42b86421e30880857222 Mon Sep 17 00:00:00 
2001 From: immanuelazn Date: Fri, 28 Feb 2025 17:08:57 -0800 Subject: [PATCH 122/142] [r] clean up `binarize()` examples --- r/R/transforms.R | 3 ++- r/man/binarize.Rd | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index 347ad926..22a0fa09 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -307,7 +307,8 @@ setMethod("short_description", "TransformBinarize", function(x) { #' as(mat_binarized, "dgCMatrix") #' #' # We can also call as a partialized function -#' binarize(threshold = 0.1)(mat) +#' partial_binarize <- binarize(threshold = 0.1) +#' partial_binarize(mat) #' @export binarize <- function(mat, threshold = 0, strict_inequality = TRUE) { assert_is(threshold, "numeric") diff --git a/r/man/binarize.Rd b/r/man/binarize.Rd index a112c00c..fd8f2f2a 100644 --- a/r/man/binarize.Rd +++ b/r/man/binarize.Rd @@ -41,5 +41,6 @@ mat_binarized as(mat_binarized, "dgCMatrix") # We can also call as a partialized function -binarize(threshold = 0.1)(mat) +partial_binarize <- binarize(threshold = 0.1) +partial_binarize(mat) } From 28bb79e1695ed4161b8b8dacf8f3b9a85984d270 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 28 Feb 2025 17:10:06 -0800 Subject: [PATCH 123/142] [r] rename `cluster` Rd to `cluster_graph` --- r/R/clustering.R | 6 +++--- r/man/{cluster.Rd => cluster_graph.Rd} | 0 2 files changed, 3 insertions(+), 3 deletions(-) rename r/man/{cluster.Rd => cluster_graph.Rd} (100%) diff --git a/r/R/clustering.R b/r/R/clustering.R index 02474aa7..1b82cec3 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -199,7 +199,7 @@ knn_to_geodesic_graph <- function(knn, return_type = c("matrix", "list"), thread } #' Cluster a cell embedding matrix using a graph based algorithm -#' @rdname cluster +#' @rdname cluster_graph #' @details **cluster_graph_leiden**: Leiden clustering algorithm `igraph::cluster_leiden()`. 
#' Note that when using `objective_function = "CPM"` the number of clusters empirically scales with `cells * resolution`, #' so 1e-3 is a good resolution for 10k cells, but 1M cells is better with a 1e-5 resolution. A resolution of 1 is a @@ -249,7 +249,7 @@ cluster_graph_leiden <- function( } -#' @rdname cluster +#' @rdname cluster_graph #' @details **cluster_graph_louvain**: Louvain graph clustering algorithm `igraph::cluster_louvain()` #' @export cluster_graph_louvain <- function( @@ -275,7 +275,7 @@ cluster_graph_louvain <- function( as.factor() } -#' @rdname cluster +#' @rdname cluster_graph #' @details **cluster_graph_seurat**: Seurat's clustering algorithm `Seurat::FindClusters()` #' @export cluster_graph_seurat <- function( diff --git a/r/man/cluster.Rd b/r/man/cluster_graph.Rd similarity index 100% rename from r/man/cluster.Rd rename to r/man/cluster_graph.Rd From e89b4675e2b318af9ac6b5553356085a741f092f Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 3 Mar 2025 14:06:25 -0800 Subject: [PATCH 124/142] [r] clean up lsi code clarity, documentation --- r/R/singlecell_utils.R | 52 +++++++++++++++++++++--------------------- r/man/IterativeLSI.Rd | 23 ++++++++++--------- r/man/LSI.Rd | 6 ++--- 3 files changed, 41 insertions(+), 40 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index ec9dc948..f2b6a5b6 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -264,18 +264,18 @@ project.default <- function(x, mat, ...) { #' @rdname LSI #' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. -#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. 
If the PC has a correlation that is greater than or equal to
+#' the corr_cutoff, it will be excluded from the final PCA matrix.
 #' @param scale_factor (numeric) Scaling factor to multiply matrix by prior to log normalization (see formulas below).
 #' @param threads (integer) Number of threads to use.
 #' @returns
 #' `LSI()` An object of class `c("LSI", "DimReduction")` with the following attributes:
 #' - `cell_embeddings`: The projected data as a matrix of shape `(cells, n_dimensions)`
-#' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following:
+#' - `fitted_params`: A named list of the parameters used for LSI, including the following:
 #'   - `scale_factor`: The scale factor used for tf-idf normalization
 #'   - `feature_means`: The means of the features used for normalization
 #'   - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth
-#' - `feature_loadings`: SVD component u with dimension `(n_variable_features, n_dimensions)`
+#' - `feature_loadings`: SVD component with dimension `(n_variable_features, n_dimensions)` used to linearly transform input data into cell embeddings
 #' - `feature_names`: The names of the features in the matrix
 #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019.
#' @@ -309,7 +309,7 @@ LSI <- function( scale_factor = scale_factor, threads = threads )(mat) - # Save to prevent re-calculation of queued operations + # Save to prevent repeating queued calculations during SVD mat <- write_matrix_dir( convert_matrix_type(mat, type = "float"), tempfile("mat"), compress = TRUE @@ -324,7 +324,7 @@ LSI <- function( pca_feats_to_keep <- which(pca_corrs < corr_cutoff) if (length(pca_feats_to_keep) != n_dimensions) { if (verbose) log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) - pca_res <- pca_res[, pca_feats_to_keep] %>% as.matrix() + pca_res <- pca_res[, pca_feats_to_keep, drop = FALSE] } fitted_params <- list( scale_factor = scale_factor, @@ -371,7 +371,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { } -#' Run iterative LSI on a matrix. +#' Run Iterative LSI on a matrix. #' #' Given a `(features x cells)` counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' This uses the method described in [ArchR](https://doi.org/10.1038/s41588-021-00790-6) (Granja et al; 2019). @@ -379,7 +379,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @rdname IterativeLSI #' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. #' @param n_iterations (int) The number of LSI iterations to perform. -#' @param feature_selection_method (function) Method to use for selecting features for each iteration after the first. +#' @param feature_selection_method (function) Method to use for selecting features for LSI. #' Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` #' @param cluster_method (function) Method to use for clustering the post-SVD matrix. #' Current builtin options are `cluster_graph_{leiden, louvain, seurat}()`. 
@@ -390,6 +390,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) {
 #' - `cell_embeddings`: The projected data as a matrix of shape `(cells, n_dimensions)`
 #' - `fitted_params`: A list of the parameters used for iterative LSI. Includes the following:
 #' - `lsi_method`: The method used for LSI
+#' - `feature_selection_method`: The method used for selecting features
 #' - `cluster_method`: The method used for clustering
 #' - `feature_means`: The means of the features used for tf-idf normalization
 #' - `iterations`: The number of LSI iterations ran
@@ -407,28 +408,28 @@ project.LSI <- function(x, mat, threads = 1L, ...) {
 #' - If `n_iterations` is 1, return the projected data from the first PCA projection
 #' - Else, cluster the LSI results using `cluster_method`
 #' - For each subsequent iteration:
-#' - Pseudobulk the clusters and select the top features based on the variance of the pseudobulked clusters
+#' - Pseudobulk the clusters and select the top features based on the pseudobulked clusters
 #' - Perform LSI on the selected features
 #' - If this is the final iteration, return the projected data from this PCA projection
 #' - Else, cluster the LSI results using `cluster_method`
 #'
 #' There are some minor differences when compared to the ArchR implementation:
-#' - ArchR uses a different method for selecting features in the first iteration. The default method is `select_features_variance`, which is the same as the ArchR implementation.
-#' `select_features_mean(normalize_method = binarize)` can be passed in for the `feature_selection_method` argument to mimic the ArchR implementation, if choosing to only run one iteration.
-#' - `IterativeLSI()` currently does not support utilization of different feature selection methods across each iteration.
+#' - ArchR binarizes data prior to feature selection. To replicate this, the user can pass `select_features_variance(normalize_method = binarize)` for their `feature_selection_method`.
+#' - `IterativeLSI()` currently does not support utilization of different feature selection methods across each iteration. +#' If one desires to use a different feature selection method for each iteration, they can take the cluster assignments from the previous +#' iteration and use them to select features and run LSI. #' - ArchR uses a default of 25000 features picked during feature selection. As the number of input features is dependent on the input matrix fed into `IterativeLSI()`, #' the default for `select_features_variance()` instead picks the number of variable features as a proportion of the total features provided. To mimic the ArchR implementation, #' `feature_selection_method` can be set to `select_features_variance(num_feats = 25000)`. -#' If one desires to use a different feature selection method for each iteration, they can take the cluster assignments from the previous iteration and use them to select features and run LSI. #' - ArchR calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, #' which BPCells does not encounter even with a non-subsetted matrix. Therefore, IterativeLSI will run LSI on the entire matrix for each iteration. #' - ArchR defaults on using Seurat clustering for default, which utilizes the Louvain algorithm (See `Seurat::FindClusters()`). In constrast, `IterativeLSI()` utilizes #' leiden, which should provide the same clustering results while being faster. -#' - ArchR also plots a umap of every iteration's dimensionality reduction. While this is not implemented in `IterativeLSI()`, +#' - ArchR also plots and calculates a umap of every iteration's dimensionality reduction. While this is not implemented in `IterativeLSI()`, #' one can use the `project()` method with the `iteration` argument set to the desired iteration to get projected data. 
This can then be fed into `uwot::umap()` -#' - ArchR by default filters out PCs with a correlation to sequencing depth greater than 0.75. +#' - ArchR filters out PCs with a correlation to sequencing depth greater than 0.75. #' While corr_cutoff is provided as an argument in `IterativeLSI()`, it is set to not removing any PCs by default. -#' - ArchR by default filters out outliers dependent on number of accesible regions of cells, by the bottom and top quantiles. This is not implemented in `IterativeLSI()`, +#' - ArchR filters out outliers dependent on number of accesible regions of cells, by the bottom and top quantiles. This is not implemented in `IterativeLSI()`, #' but can be done as a preprocessing step. #' @seealso `LSI()` `DimReduction()` `svds()` `knn_hnsw()` `knn_annoy()` #' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` @@ -454,6 +455,7 @@ IterativeLSI <- function( fitted_params <- list( lsi_method = LSI(n_dimensions = n_dimensions, corr_cutoff = corr_cutoff, scale_factor = scale_factor, threads = threads), cluster_method = cluster_method, + feature_selection_method = feature_selection_method, feature_means = matrix_stats(mat, row_stats = "mean", threads = threads)$row_stats["mean",], iterations = n_iterations, iter_info = tibble::tibble( @@ -464,18 +466,17 @@ IterativeLSI <- function( clusters = list() ) ) + feature_selection_method <- partial_apply(feature_selection_method, threads = threads, .missing_args_error = FALSE) if (verbose) log_progress("Starting Iterative LSI") for (i in seq_len(n_iterations)) { if (verbose) log_progress(sprintf("Starting Iterative LSI iteration %s of %s", i, n_iterations)) - # add a blank row to the iter_info tibble - # run variable feature selection if (verbose) log_progress("Selecting features") if (i == 1) { - var_feature_table <- partial_apply(feature_selection_method, threads = threads, .missing_args_error = FALSE)(mat) + 
var_feature_table <- feature_selection_method(mat) } else { - var_feature_table <- partial_apply(feature_selection_method, threads = threads, .missing_args_error = FALSE)(pseudobulk_res) + var_feature_table <- feature_selection_method(pseudobulk_res) } variable_features <- var_feature_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(feature) if (is.character(variable_features)) { @@ -494,24 +495,23 @@ IterativeLSI <- function( threads = threads, verbose = verbose ) - # only cluster + pseudobulk if this isn't the last iteration fitted_params$iter_info <- tibble::add_row( fitted_params$iter_info, iteration = i, feature_names = list(variable_features), feature_loadings = list(lsi_res_obj$fitted_params$feature_loadings), pcs_to_keep = list(lsi_res_obj$fitted_params$pcs_to_keep) ) + # only cluster + pseudobulk if this isn't the last iteration if (i == n_iterations) break # cluster the LSI results if (verbose) log_progress("Clustering LSI results") - clustering_res <- lsi_res_obj$cell_embeddings[, lsi_res_obj$fitted_params$pcs_to_keep] %>% - as.matrix() %>% + clustering_res <- lsi_res_obj$cell_embeddings[, lsi_res_obj$fitted_params$pcs_to_keep, drop = FALSE] %>% partial_apply(cluster_method, threads = threads, .missing_args_error = FALSE)() fitted_params$iter_info$clusters[[i]] <- clustering_res # pseudobulk and pass onto next iteration if (verbose) log_progress("Pseudobulking matrix") - pseudobulk_res <- pseudobulk_matrix(mat, clustering_res, threads = as.integer(threads)) + pseudobulk_res <- pseudobulk_matrix(mat, clustering_res, threads = threads) rownames(pseudobulk_res) <- rownames(mat) } if (verbose) log_progress("Finished running Iterative LSI") @@ -526,15 +526,15 @@ IterativeLSI <- function( #' @rdname IterativeLSI #' @param iteration (integer) Which iteration of `IterativeLSI`'s features and loadings to use for projection. #' @return -#' `project()` IterableMatrix of the projected data of shape `(cells, n_dimensions)`. 
+#' `project()` Matrix of the projected data of shape `(cells, n_dimensions)`. #' @inheritParams project #' @export project.IterativeLSI <- function(x, mat, iteration = x$fitted_params$iterations, threads = 1L, ...) { assert_is_mat(mat) assert_is_wholenumber(threads) - fitted_params <- x$fitted_params - # Get the desired row of iter_info tibble assert_is_wholenumber(iteration) + fitted_params <- x$fitted_params + # Get the desired iteration row of iter_info tibble assert_true(iteration <= x$fitted_params$iterations) iter_info <- fitted_params$iter_info[iteration, ] diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 212140d3..0837d279 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -3,7 +3,7 @@ \name{IterativeLSI} \alias{IterativeLSI} \alias{project.IterativeLSI} -\title{Run iterative LSI on a matrix.} +\title{Run Iterative LSI on a matrix.} \usage{ IterativeLSI( mat, @@ -24,14 +24,14 @@ IterativeLSI( \item{n_iterations}{(int) The number of LSI iterations to perform.} -\item{feature_selection_method}{(function) Method to use for selecting features for each iteration after the first. +\item{feature_selection_method}{(function) Method to use for selecting features for LSI. Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_mean}, \code{select_features_binned_dispersion}} \item{scale_factor}{(numeric) Scaling factor to multiply matrix by prior to log normalization (see formulas below).} \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} -\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. 
If the PC has a correlation that is greater or equal to the corr_cutoff, it will be excluded from the final PCA matrix.} \item{cluster_method}{(function) Method to use for clustering the post-SVD matrix. @@ -52,6 +52,7 @@ The user can pass in partial parameters to the cluster method, such as by passin \item \code{fitted_params}: A list of the parameters used for iterative LSI. Includes the following: \itemize{ \item \code{lsi_method}: The method used for LSI +\item \code{feature_selection_method}: The method used for selecting features \item \code{cluster_method}: The method used for clustering \item \code{feature_means}: The means of the features used for tf-idf normalization \item \code{iterations}: The number of LSI iterations ran @@ -66,7 +67,7 @@ The user can pass in partial parameters to the cluster method, such as by passin } } -\code{project()} IterableMatrix of the projected data of shape \verb{(cells, n_dimensions)}. +\code{project()} Matrix of the projected data of shape \verb{(cells, n_dimensions)}. } \description{ Given a \verb{(features x cells)} counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. @@ -85,7 +86,7 @@ The Iterative LSI method is as follows: } \item For each subsequent iteration: \itemize{ -\item Pseudobulk the clusters and select the top features based on the variance of the pseudobulked clusters +\item Pseudobulk the clusters and select the top features based on the pseudobulked clusters \item Perform LSI on the selected features \item If this is the final iteration, return the projected data from this PCA projection \item Else, cluster the LSI results using \code{cluster_method} @@ -94,22 +95,22 @@ The Iterative LSI method is as follows: There are some minor differences when compared to the ArchR implementation: \itemize{ -\item ArchR uses a different method for selecting features in the first iteration. 
The default method is \code{select_features_variance}, which is the same as the ArchR implementation.
-\code{select_features_mean(normalize_method = binarize)} can be passed in for the \code{feature_selection_method} argument to mimic the ArchR implementation, if choosing to only run one iteration.
+\item ArchR binarizes data prior to feature selection. To replicate this, the user can pass \code{select_features_variance(normalize_method = binarize)} for their \code{feature_selection_method}.
 \item \code{IterativeLSI()} currently does not support utilization of different feature selection methods across each iteration.
+If one desires to use a different feature selection method for each iteration, they can take the cluster assignments from the previous
+iteration and use them to select features and run LSI.
 \item ArchR uses a default of 25000 features picked during feature selection. As the number of input features is dependent on the input matrix fed into \code{IterativeLSI()},
 the default for \code{select_features_variance()} instead picks the number of variable features as a proportion of the total features provided. To mimic the ArchR implementation,
 \code{feature_selection_method} can be set to \code{select_features_variance(num_feats = 25000)}.
-If one desires to use a different feature selection method for each iteration, they can take the cluster assignments from the previous iteration and use them to select features and run LSI.
 \item ArchR calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck,
 which BPCells does not encounter even with a non-subsetted matrix. Therefore, IterativeLSI will run LSI on the entire matrix for each iteration.
 \item ArchR defaults on using Seurat clustering for default, which utilizes the Louvain algorithm (See \code{Seurat::FindClusters()}). In constrast, \code{IterativeLSI()} utilizes
leiden, which should provide the same clustering results while being faster.
-\item ArchR also plots a umap of every iteration's dimensionality reduction. While this is not implemented in \code{IterativeLSI()}, +\item ArchR also plots and calculates a umap of every iteration's dimensionality reduction. While this is not implemented in \code{IterativeLSI()}, one can use the \code{project()} method with the \code{iteration} argument set to the desired iteration to get projected data. This can then be fed into \code{uwot::umap()} -\item ArchR by default filters out PCs with a correlation to sequencing depth greater than 0.75. +\item ArchR filters out PCs with a correlation to sequencing depth greater than 0.75. While corr_cutoff is provided as an argument in \code{IterativeLSI()}, it is set to not removing any PCs by default. -\item ArchR by default filters out outliers dependent on number of accesible regions of cells, by the bottom and top quantiles. This is not implemented in \code{IterativeLSI()}, +\item ArchR filters out outliers dependent on number of accesible regions of cells, by the bottom and top quantiles. This is not implemented in \code{IterativeLSI()}, but can be done as a preprocessing step. } } diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index ec3d1039..153952f0 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -21,7 +21,7 @@ LSI( \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} -\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. 
If the PC has a correlation that is greater or equal to the corr_cutoff, it will be excluded from the final PCA matrix.} \item{scale_factor}{(numeric) Scaling factor to multiply matrix by prior to log normalization (see formulas below).} @@ -34,12 +34,12 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} \code{LSI()} An object of class \code{c("LSI", "DimReduction")} with the following attributes: \itemize{ \item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(cells, n_dimensions)} -\item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: +\item \code{fitted_params}: A named list of the parameters used for LSI, including the following: \itemize{ \item \code{scale_factor}: The scale factor used for tf-idf normalization \item \code{feature_means}: The means of the features used for normalization \item \code{pcs_to_keep}: The PCs that were kept after filtering by correlation to sequencing depth -\item \code{feature_loadings}: SVD component u with dimension \verb{(n_variable_features, n_dimensions)} +\item \code{feature_loadings}: SVD component with dimension \verb{(n_variable_features, n_dimensions)} used to linearly transform input data into cell embeddings } \item \code{feature_names}: The names of the features in the matrix } From 8de044e7bc2b69b8b0c694818a1acb5c3dfaaabb Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 3 Mar 2025 14:33:03 -0800 Subject: [PATCH 125/142] [r] update LSI documentation --- r/R/singlecell_utils.R | 5 +++-- r/man/IterativeLSI.Rd | 2 +- r/man/LSI.Rd | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index f2b6a5b6..47614edb 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -266,7 +266,7 @@ project.default <- function(x, mat, ...) { #' @param n_dimensions (integer) Number of dimensions to keep during PCA. 
#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is greater or equal to #' the corr_cutoff, it will be excluded from the final PCA matrix. -#' @param scale_factor (numeric) Scaling factor to multiply matrix by prior to log normalization (see formulas below). +#' @param scale_factor (numeric) Scaling factor to multiply matrix by during tf-idf normalization. #' @param threads (integer) Number of threads to use. #' @returns #' `LSI()` An object of class `c("LSI", "DimReduction")` with the following attributes: @@ -305,10 +305,11 @@ LSI <- function( mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads) read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) mat <- normalize_tfidf( + mat = mat, feature_means = mat_stats$row_stats["mean", ], scale_factor = scale_factor, threads = threads - )(mat) + ) # Save to prevent repeating queued calculations during SVD mat <- write_matrix_dir( convert_matrix_type(mat, type = "float"), diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 0837d279..c3526ddb 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -27,7 +27,7 @@ IterativeLSI( \item{feature_selection_method}{(function) Method to use for selecting features for LSI. 
Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_mean}, \code{select_features_binned_dispersion}} -\item{scale_factor}{(numeric) Scaling factor to multiply matrix by prior to log normalization (see formulas below).} +\item{scale_factor}{(numeric) Scaling factor to multiply matrix by during tf-idf normalization.} \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index 153952f0..2210f081 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -24,7 +24,7 @@ LSI( \item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is greater or equal to the corr_cutoff, it will be excluded from the final PCA matrix.} -\item{scale_factor}{(numeric) Scaling factor to multiply matrix by prior to log normalization (see formulas below).} +\item{scale_factor}{(numeric) Scaling factor to multiply matrix by during tf-idf normalization.} \item{threads}{(integer) Number of threads to use.} From adb17492fb2964f464ba826a813f7bc504d0ee6f Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 3 Mar 2025 14:46:46 -0800 Subject: [PATCH 126/142] [r] expand docstrings for `IterativeLSI()` --- r/R/singlecell_utils.R | 1 + r/man/IterativeLSI.Rd | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 47614edb..8e0710f3 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -386,6 +386,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' Current builtin options are `cluster_graph_{leiden, louvain, seurat}()`. #' The user can pass in partial parameters to the cluster method, such as by passing #' `cluster_graph_leiden(resolution = 0.5, knn_method = knn_hnsw(ef = 500, k = 12), knn_to_graph_method = knn_to_snn_graph(min_val = 0.1))`. +#' @param threads (integer) Number of threads to use. 
Also gets passed down into `feature_selection_method` and `cluster_method` #' @return #' `IterativeLSI()` An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data as a matrix of shape `(cells, n_dimensions)` diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index c3526ddb..0635ea68 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -39,7 +39,7 @@ Current builtin options are \verb{cluster_graph_\{leiden, louvain, seurat\}()}. The user can pass in partial parameters to the cluster method, such as by passing \code{cluster_graph_leiden(resolution = 0.5, knn_method = knn_hnsw(ef = 500, k = 12), knn_to_graph_method = knn_to_snn_graph(min_val = 0.1))}.} -\item{threads}{(integer) Number of threads to use.} +\item{threads}{(integer) Number of threads to use. Also gets passed down into \code{feature_selection_method} and \code{cluster_method}} \item{x}{DimReduction object.} From 80e2ad2aa4d4f8f2bf83cc5c72abfe4663843d09 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 3 Mar 2025 14:48:19 -0800 Subject: [PATCH 127/142] [r] clean up `knn_to_graph_method` param in `cluster_graph*()` functions --- r/R/clustering.R | 2 +- r/man/cluster_graph.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/clustering.R b/r/R/clustering.R index 1b82cec3..cf09ceff 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -213,7 +213,7 @@ knn_to_geodesic_graph <- function(knn, return_type = c("matrix", "list"), thread #' For the meaning of each option, see `igraph::cluster_leiden()`. #' @param knn_method (function) Function to convert `mat` from cell embeddings to a knn object. #' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if `mat` is already a knn object/graph adjacency matrix. -#' @param knn_to_graph_method (function) Function to convert `mat` from a knn object generated from `knn_method` to a graph adjacency matrix. 
+#' @param knn_to_graph_method (function) Function to convert `mat` from a knn object to a graph adjacency matrix. #' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. #' Ignored if `mat` is already a graph adjacency matrix. #' @param seed Random seed for clustering initialization diff --git a/r/man/cluster_graph.Rd b/r/man/cluster_graph.Rd index ebf80f8a..a7ef707f 100644 --- a/r/man/cluster_graph.Rd +++ b/r/man/cluster_graph.Rd @@ -52,7 +52,7 @@ For the meaning of each option, see \code{igraph::cluster_leiden()}.} \item{knn_method}{(function) Function to convert \code{mat} from cell embeddings to a knn object. Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if \code{mat} is already a knn object/graph adjacency matrix.} -\item{knn_to_graph_method}{(function) Function to convert \code{mat} from a knn object generated from \code{knn_method} to a graph adjacency matrix. +\item{knn_to_graph_method}{(function) Function to convert \code{mat} from a knn object to a graph adjacency matrix. Must be a (optionally partialized) version of \code{knn_to_graph()}, \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Ignored if \code{mat} is already a graph adjacency matrix.} From a737385225342d3d4962a6f448eeb70f902b3ce3 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 3 Mar 2025 14:55:23 -0800 Subject: [PATCH 128/142] [r] clean up `select_features*()` documentation --- r/R/singlecell_utils.R | 5 ++--- r/man/feature_selection.Rd | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 8e0710f3..80aa94f9 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -21,9 +21,8 @@ #' @param mat (IterableMatrix) Counts matrix with dimensions `(features x cells)`. #' @param num_feats (float) Number of features to mark as highly_variable. 
If 0 < num_feats < 1, then interpret it as a fraction of features. #' @param normalize_method (function) Used to normalize the matrix prior to feature selection by calling `normalize_method(mat)` if it is not NULL. -#' For example, pass normalize_log() or normalize_tfidf(). -#' If the normalize function accepts a threads argument, that will passed as normalize(mat, threads=threads). -#' @param threads (integer) Number of threads to use. +#' For example, pass `normalize_log()` or `normalize_tfidf()`. +#' @param threads (integer) Number of threads to use. Also overrides the threads argument in `normalize_method` #' @returns #' Return a dataframe with the following columns: #' - `feature`: Feature name. diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index d80fefbd..62edcc3b 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -41,10 +41,9 @@ select_features_binned_dispersion( \item{num_feats}{(float) Number of features to mark as highly_variable. If 0 < num_feats < 1, then interpret it as a fraction of features.} \item{normalize_method}{(function) Used to normalize the matrix prior to feature selection by calling \code{normalize_method(mat)} if it is not NULL. -For example, pass normalize_log() or normalize_tfidf(). -If the normalize function accepts a threads argument, that will passed as normalize(mat, threads=threads).} +For example, pass \code{normalize_log()} or \code{normalize_tfidf()}.} -\item{threads}{(integer) Number of threads to use.} +\item{threads}{(integer) Number of threads to use. 
Also overrides the threads argument in \code{normalize_method}} \item{n_bins}{(integer) Number of bins to split features into in order to control for the relationship between mean expression and dispersion (see details).} } From 6de7df5044d3f4560fbb45b392b3f3197da9f4d0 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 3 Mar 2025 17:32:16 -0800 Subject: [PATCH 129/142] [r] revert broken case in `normalize_tfidf()` with non-null row means --- r/R/transforms.R | 2 +- r/tests/testthat/test-matrix_transforms.R | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index 22a0fa09..15d8c388 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -1015,7 +1015,7 @@ normalize_tfidf <- function( read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) } else { assert_is_numeric(feature_means) - if (!is.null(rownames(mat))) { + if (!is.null(names(feature_means)) && !is.null(rownames(mat))) { # Make sure every name in feature means exists in rownames(mat) assert_true(all(rownames(mat) %in% names(feature_means))) # subset feature_means to the rownames of mat in the case there is a length mismatch diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index 24cd9c23..a8655375 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -362,6 +362,8 @@ test_that("tf-idf normalization works", { row_means_shuffled <- row_means[sample(1:length(row_means))] # Test that row means can have an extra element as long as all rownames are in the vector row_means_plus_one <- c(row_means, row6 = 1) + # Check when row means has no row names + row_means_no_names <- unname(row_means) res <- normalize_tfidf(m2) @@ -376,6 +378,17 @@ test_that("tf-idf normalization works", { res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one) expect_identical(res, res_with_row_means_with_extra_element) + + # Check 
cases where names exists in either row means xor rownames(mat) + # check where both don't have names + res_with_unnamed_row_means <- normalize_tfidf(m2, feature_means = row_means_no_names) + expect_identical(res_with_unnamed_row_means, res) + rownames(m2) <- NULL + res_with_unnamed_row_names <- normalize_tfidf(m2, feature_means = row_means) + res_with_unnamed_row_names_row_means <- normalize_tfidf(m2, feature_means = row_means_no_names) + rownames(res) <- NULL + expect_identical(as(res_with_unnamed_row_names, "dgCMatrix"), as(res, "dgCMatrix")) + expect_identical(as(res_with_unnamed_row_names_row_means, "dgCMatrix"), as(res, "dgCMatrix")) }) test_that("normalize_log works", { From aa5d9a25af215420fcd47fcf030e1df3d07b080e Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 3 Mar 2025 17:33:10 -0800 Subject: [PATCH 130/142] [r] remove functionality of `assert_is_mat()` when passed multiple matrices --- r/R/errorChecking.R | 12 ++---------- r/tests/testthat/test-errorChecking.R | 2 +- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/r/R/errorChecking.R b/r/R/errorChecking.R index 6ef4174d..ed24087d 100644 --- a/r/R/errorChecking.R +++ b/r/R/errorChecking.R @@ -115,16 +115,8 @@ assert_is <- function(object, class, n = 1) { } assert_is_mat <- function(object, n = 1) { # matrices have length set to row*col instead of being 1, so we need to check dim as well - if (length(object) == 1 || !is.null(dim(object))) { - if (!is(object, "IterableMatrix") && !canCoerce(object, "IterableMatrix")) { - pretty_error(object, "must either be an IterableMatrix or coercible to an IterableMatrix", n) - } - } else { - for (mat in object) { - if (!is(mat, "IterableMatrix") && !canCoerce(mat, "IterableMatrix")) { - pretty_error(mat, "must either be an IterableMatrix or coercible to an IterableMatrix", n) - } - } + if (!canCoerce(object, "IterableMatrix")) { + pretty_error(object, "must either be an IterableMatrix or coercible to an IterableMatrix", n) } } assert_true <- 
function(expr, n = 1) { diff --git a/r/tests/testthat/test-errorChecking.R b/r/tests/testthat/test-errorChecking.R index afcc6a01..355a5d29 100644 --- a/r/tests/testthat/test-errorChecking.R +++ b/r/tests/testthat/test-errorChecking.R @@ -12,7 +12,7 @@ test_that("assert_is_mat works", { mat_iterable <- as(mat, "IterableMatrix") expect_no_error(assert_is_mat(mat)) expect_no_error(assert_is_mat(mat_dgc)) - expect_no_error(assert_is_mat(c(mat_iterable, mat_iterable))) + expect_error(assert_is_mat(c(mat_iterable, mat_iterable))) expect_error(assert_is_mat("a")) expect_error(assert_is_mat(c("a", "a"))) expect_error(assert_is_mat(1)) From 1b588dd9de534f7dac8a1420d8929f0c4a4de23a Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 3 Mar 2025 17:51:53 -0800 Subject: [PATCH 131/142] [r] clean up svd call in `LSI()` --- r/R/singlecell_utils.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 80aa94f9..78bb35b2 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -317,7 +317,8 @@ LSI <- function( # Run pca if (verbose) log_progress("Calculating SVD") svd_attr <- svds(mat, k = n_dimensions, threads = threads) - pca_res <- t(mat) %*% svd_attr$u + pca_res <- svd_attr$v %*% diag(svd_attr$d) + rownames(pca_res) <- colnames(mat) # Filter out PCs that are highly correlated with sequencing depth pca_corrs <- abs(cor(read_depth, pca_res)) From 4016ddf6a0dd9bcff9d4ef456c3798e60b430f5d Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 5 Mar 2025 13:40:43 -0800 Subject: [PATCH 132/142] [r] add in POC clustering wrapper --- r/R/clustering.R | 56 ++++++++++++++++++++++++++++ r/man/cluster_cells_graph.Rd | 60 ++++++++++++++++++++++++++++++ r/pkgdown/_pkgdown.yml | 1 + r/tests/testthat/test-clustering.R | 19 ++++++++++ 4 files changed, 136 insertions(+) create mode 100644 r/man/cluster_cells_graph.Rd diff --git a/r/R/clustering.R b/r/R/clustering.R index cf09ceff..a9d7a237 100644 --- 
a/r/R/clustering.R
+++ b/r/R/clustering.R
@@ -66,6 +66,62 @@ convert_mat_to_graph <- function(
   return(mat)
 }
 
+#' Cluster Embeddings using a kNN-Graph Based Community Algorithm
+#'
+#' Take in a cell embedding matrix, and sequentially convert it into a kNN object, then to
+#' a graph adjacency matrix. Following, a community detection algorithm assigns a cluster label
+#' to every cell.
+#'
+#' @param mat (matrix) Cell embeddings matrix of shape `(cells x n_embeddings)`
+#' @param knn_method (function) Function to convert input to a knn object (e.g., `knn_hnsw`, `knn_annoy`).
+#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`.
+#' @param knn_to_graph_method (function) Function to convert the knn object returned from `knn_method` to a graph adjacency matrix.
+#' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`.
+#' @param graph_to_cluster_method (function) Community detection algorithm that converts a graph adjacency matrix
+#' returned from `knn_to_graph_method` into cluster labels for each cell.
+#' Must be a (optionally partialized) version of `cluster_graph_leiden()`, `cluster_graph_louvain()` or `cluster_graph_seurat()`.
+#' @param threads (integer) Number of threads to use in `knn_method`, `knn_to_graph_method` and `graph_to_cluster_method`. If these functions do not utilize
+#' a `threads` argument, this is silently ignored.
+#' @param verbose (logical) Whether to print progress information in `knn_method`, `knn_to_graph_method` and `graph_to_cluster_method`. If these functions do not utilize
+#' a `verbose` argument, this is silently ignored.
+#' @returns (factor) Factor vector containing the cluster assignment for each cell.
+#' @details
+#' `cluster_cells_graph()` acts as a helper function to wrap input creation and `kNN` graph adjacency-based clustering to be done together. 
The user +#' can also manually pass cell embeddings to their preferred knn/clustering functions of choices. +#' +#' **Clustering customization through partialized parameters** +#' +#' Customization of clustering is possible through partialization of each parameter in `cluster_cells_graph()` that is a function. +#' In detail, each parameter that requests a function +#' may take in one with only some of the arguments provided. If the first argument is not provided, a copy of a function is utilized that has its parameters +#' changed with the arguments provided. +#' +#' For instance, if the user desires for `cluster_cells_graph()` to instead use `cluster_graph_louvain()` with resolution different than the default, +#' they can instead call `cluster_cells_graph()` like so: +#' `cluster_cells_graph(mat, graph_to_cluster_method = cluter_graph_louvain(resolution = 0.5))` +#' @seealso `knn_hnsw()` `knn_annoy()` `knn_to_graph()` `knn_to_snn_graph()` `knn_to_geodesic_graph()` `cluster_graph_leiden()` `knn_to_snn_graph()` `knn_to_geodesic_graph()` +cluster_cells_graph <- function( + mat, knn_method = knn_hnsw, + knn_to_graph_method = knn_to_geodesic_graph, + graph_to_cluster_method = cluster_graph_leiden, + threads = 0L, verbose = FALSE +) { + assert_true(is.matrix(mat)) + assert_is_wholenumber(threads) + assert_is(verbose, "logical") + # There currently aren't any `knn_to_graph` functions that utilize a verbose argument. + # However, we still pass `verbose` in case future functions do provide this functionality. + if (is(mat, "matrix")) { + mat <- partial_apply(knn_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) + } + if (!is_knn_object(mat)) pretty_error(mat, "`knn_method` was unable to convert `mat` into a knn object", 1) + # Return type has to be constrained to "matrix", so this is silently provided. 
+ mat <- partial_apply(knn_to_graph_method, threads = threads, verbose = verbose, return_type = "matrix", .missing_args_error = FALSE)(mat) + if (!is_adjacency_matrix(mat)) pretty_error(mat, "`knn_to_graph_method` was unable to convert `mat` from a knn object to a graph adjacency matrix", 1) + # Also pass verbose and threads to clustering functions in case they are given these params in the future + mat <- partial_apply(graph_to_cluster_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) + return(mat) +} #' K Nearest Neighbor (KNN) Graph #' diff --git a/r/man/cluster_cells_graph.Rd b/r/man/cluster_cells_graph.Rd new file mode 100644 index 00000000..1ac69d21 --- /dev/null +++ b/r/man/cluster_cells_graph.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{cluster_cells_graph} +\alias{cluster_cells_graph} +\title{Cluster Embeddings using a kNN-Graph Based Community Algorithm} +\usage{ +cluster_cells_graph( + mat, + knn_method = knn_hnsw, + knn_to_graph_method = knn_to_geodesic_graph, + graph_to_cluster_method = cluster_graph_leiden, + threads = 0L, + verbose = FALSE +) +} +\arguments{ +\item{mat}{(matrix) Cell embeddings matrix of shape \verb{(cells x n_embeddings)}} + +\item{knn_method}{(function) Function to convert input to a knn object (e.g., \code{knn_hnsw}, \code{knn_annoy}). +Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}.} + +\item{knn_to_graph_method}{(function) Function to convert the knn object returned from \code{knn_method} to a graph adjacency matrix. +Must be a (optionally partialized) version of \code{knn_to_graph()}, \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}.} + +\item{graph_to_cluster_method}{(function) Community detection algorithm that converts a graph adjacency matrix +returned from \code{graph_to_cluster_method} into cluster labels for each cell. 
+Must be a (optionally partialized) version of \code{cluster_graph_leiden()}, \code{cluster_graph_louvain()} or \code{cluster_graph_seurat()}.} + +\item{threads}{(integer) Number of threads to use in \code{knn_method}, \code{knn_to_graph_method} and \code{graph_to_cluster_method}. If these functions do not utilize +a \code{threads} argument, this is silently ignored.} + +\item{verbose}{(logical) Whether to print progress information in \code{knn_method}, \code{knn_to_graph_method} and \code{graph_to_cluster_method}. If these functions do not utilize +a \code{verbose} argument, this is silently ignored.} +} +\value{ +(factor) Factor vector containing the cluster assignment for each cell. +} +\description{ +Take in a cell embedding matrix, and sequentially convert it into a kNN object, then to +a graph adjacency matrix. Following, a community detection algorithm assigns a cluster label +to every cell. +} +\details{ +\code{cluster_cells_graph()} acts as a helper function to wrap input creation and \code{kNN} graph adjacency-based clustering to be done together. The user +can also manually pass cell embeddings to their preferred knn/clustering functions of choices. + +\strong{Clustering customization through partialized parameters} + +Customization of clustering is possible through partialization of each parameter in \code{cluster_cells_graph()} that is a function. +In detail, each parameter that requests a function +may take in one with only some of the arguments provided. If the first argument is not provided, a copy of a function is utilized that has its parameters +changed with the arguments provided. 
+ +For instance, if the user desires for \code{cluster_cells_graph()} to instead use \code{cluster_graph_louvain()} with resolution different than the default, +they can instead call \code{cluster_cells_graph()} like so: +\code{cluster_cells_graph(mat, graph_to_cluster_method = cluter_graph_louvain(resolution = 0.5))} +} +\seealso{ +\code{knn_hnsw()} \code{knn_annoy()} \code{knn_to_graph()} \code{knn_to_snn_graph()} \code{knn_to_geodesic_graph()} \code{cluster_graph_leiden()} \code{knn_to_snn_graph()} \code{knn_to_geodesic_graph()} +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 0d21a1a0..612d817f 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -142,6 +142,7 @@ reference: - select_features_variance - subtitle: "Clustering" - contents: + - cluster_cells_graph - knn_hnsw - cluster_graph_leiden - knn_to_graph diff --git a/r/tests/testthat/test-clustering.R b/r/tests/testthat/test-clustering.R index 0da83bc7..d5a692c0 100644 --- a/r/tests/testthat/test-clustering.R +++ b/r/tests/testthat/test-clustering.R @@ -72,4 +72,23 @@ test_that("igraph clustering doesn't crash", { }) expect_no_condition(cluster_graph_louvain(graph)) +}) + +test_that("cluster_cells_graph works", { + skip_if_not_installed("RcppAnnoy") + skip_if_not_installed("RcppHNSW") + mat <- matrix(sample.int(1000, 10000, replace=TRUE), nrow=1000) + # check with default params + res <- expect_no_error(cluster_cells_graph(mat)) + # check with threads, method partialization + expect_true(class(res) == "factor") + expect_equal(nrow(mat), length(res)) + res_partialized <- expect_no_error( + cluster_cells_graph( + mat, knn_method = knn_annoy(k = 9), + knn_to_graph_method = knn_to_snn_graph(min_val = 1/10), + graph_to_cluster_method = cluster_graph_louvain(resolution = 0.8), + )) + expect_true(class(res) == "factor") + expect_equal(nrow(mat), length(res)) }) \ No newline at end of file From 64b6df4a57966efb8459c8c6c1798958065b22b6 Mon Sep 17 00:00:00 2001 From: immanuelazn 
Date: Wed, 5 Mar 2025 14:52:05 -0800 Subject: [PATCH 133/142] [r] change clustering functions to only accept graph adjacency matrix integrate `cluster_cells_graph()` into codebase --- r/NEWS.md | 3 +- r/R/clustering.R | 92 ++++-------------------- r/R/singlecell_utils.R | 11 ++- r/man/IterativeLSI.Rd | 11 ++- r/man/cluster_cells_graph.Rd | 8 +-- r/man/cluster_graph.Rd | 46 ++---------- r/man/convert_mat_to_graph.Rd | 35 --------- r/tests/testthat/test-singlecell_utils.R | 6 +- 8 files changed, 35 insertions(+), 177 deletions(-) delete mode 100644 r/man/convert_mat_to_graph.Rd diff --git a/r/NEWS.md b/r/NEWS.md index 9c3a0ff5..c695e83b 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -10,7 +10,6 @@ Contributions welcome :) ## Breaking changes - Change first parameter name of `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()` from `snn` to `mat` to more accurately reflect the input type. (pull request #189) -- Added parameters, `knn_method` and `knn_to_graph_method` to `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()`. (pull request #189) ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166) @@ -19,7 +18,7 @@ Contributions welcome :) - Add feature selection functions `select_features_variance()`, and `select_features_{dispersion,mean,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189) - Add `LSI()` and `IterativeLSI()` dimensionality functions to perform latent semantic indexing on a matrix (pull request #189). - Add capability to create partial function objects in when excluding the first argument of a function. This is implemented in normalizations, feature selections, dimensionality reductions, and clustering functions. See `select_features_variance()` for usage. 
(pull request #189) -- Allowed clustering functions `cluster_graph_leiden()`, `cluster_graph_louvain()`, and `cluster_graph_seurat()` to also perform knn object and graph adjacency intermediate construction steps with `knn_method` and `knn_to_graph_method` parameters. Also provided `threads` and `verbose` arguments to clustering functions that are automatically passed down to knn object/graph adjacency construction steps. Baseline expectation of usage in clustering functions is changed to having users directly input a cell embedding matrix. (pull request #189) +- Create a wrapper function `cluster_cells_graph()` that wraps the steps of knn object creation, graph adjacency creation, and clustering all within a single function (pull request #189) ## Improvements - Speed up taking large subsets of large concatenated matrices, e.g. selecting 9M cells from a 10M cell matrix composed of ~100 concatenated pieces. (pull request #179) diff --git a/r/R/clustering.R b/r/R/clustering.R index a9d7a237..cb5eebf0 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -28,56 +28,18 @@ is_adjacency_matrix <- function(mat) { return(is(mat, "dgCMatrix") && nrow(mat) == ncol(mat)) } -#' Converts a matrix to a graph adjacency matrix -#' -#' Handles inputs so that cell embeddings and knn objects are converted into graph adjacency matrices. -#' @param mat Input matrix to be converted of shape `(cells x n_embeddings)` -#' @param knn_method (function) Function to convert input to a knn object (e.g., `knn_hnsw`, `knn_annoy`). -#' Ignored if `mat` is already a knn object or graph matrix. -#' @param knn_to_graph_method (function) Function to convert a knn object to a graph adjacency matrix -#' (e.g., `knn_to_geodesic_graph`). Ignored if `mat` is already a graph matrix. -#' @details -#' If `mat` is a `(cells x n_embeddings)` matrix then `knn_method` is used to convert `mat` to a knn object. -#' Knn objects are then converted into a graph adjacency matrix using `knn_to_graph_method`. 
Also does intermediate checks to ensure -#' that `knn_method` and `knn_to_graph` properly converted matrix into a correct output. -#' @return Symmetric graph adjacency matrix (dgCMatrix) -#' @keywords internal -convert_mat_to_graph <- function( - mat, - knn_method = knn_hnsw, - knn_to_graph_method = knn_to_geodesic_graph, - threads = 1L, - verbose = FALSE -) { - if (is(mat, "matrix")) { - mat <- partial_apply(knn_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) - } - if (!(is_knn_object(mat) || is_adjacency_matrix(mat))) { - pretty_error(mat, "must be a knn object, or convertible to one", 1) - } - # There currently aren't any `knn_to_graph` functions that utilize a verbose argument. - # However, we still pass `verbose` in case future functions do provide this functionality. - if (is_knn_object(mat)) { - mat <- partial_apply(knn_to_graph_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) - } - if (!is_adjacency_matrix(mat)) { - pretty_error(mat, "must be a graph adjacency matrix, or convertible to one", 1) - } - return(mat) -} - -#' Cluster Embeddings using a kNN-Graph Based Community Algorithm +#' Cluster Embeddings using a kNN-Graph Based Algorithm #' #' Take in a cell embedding matrix, and sequentially convert it into a kNN object, then to -#' a graph adjacency matrix. Following, a community detection algorithm assigns a cluster label +#' a graph adjacency matrix. Following, a clustering algorithm assigns a label #' to every cell. #' #' @param mat (matrix) Cell embeddings matrix of shape `(cells x n_embeddings)` -#' @param knn_method (function) Function to convert input to a knn object (e.g., `knn_hnsw`, `knn_annoy`). +#' @param knn_method (function) Function to convert cell embeddings into a knn object. #' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. 
#' @param knn_to_graph_method (function) Function to convert the knn object returned from `knn_method` to a graph adjacency matrix. #' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. -#' @param graph_to_cluster_method (function) Community detection algorithm that converts a graph adjacency matrix +#' @param graph_to_cluster_method (function) Clustering algorithm that converts a graph adjacency matrix #' returned from `graph_to_cluster_method` into cluster labels for each cell. #' Must be a (optionally partialized) version of `cluster_graph_leiden()`, `cluster_graph_louvain()` or `cluster_graph_seurat()`. #' @param threads (integer) Number of threads to use in `knn_method`, `knn_to_graph_method` and `graph_to_cluster_method`. If these functions do not utilize @@ -106,14 +68,13 @@ cluster_cells_graph <- function( graph_to_cluster_method = cluster_graph_leiden, threads = 0L, verbose = FALSE ) { - assert_true(is.matrix(mat)) assert_is_wholenumber(threads) assert_is(verbose, "logical") + if (rlang::is_missing(mat)) return(create_partial()) + assert_is(mat, "matrix") # There currently aren't any `knn_to_graph` functions that utilize a verbose argument. # However, we still pass `verbose` in case future functions do provide this functionality. - if (is(mat, "matrix")) { - mat <- partial_apply(knn_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) - } + mat <- partial_apply(knn_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) if (!is_knn_object(mat)) pretty_error(mat, "`knn_method` was unable to convert `mat` into a knn object", 1) # Return type has to be constrained to "matrix", so this is silently provided. 
mat <- partial_apply(knn_to_graph_method, threads = threads, verbose = verbose, return_type = "matrix", .missing_args_error = FALSE)(mat) @@ -254,44 +215,27 @@ knn_to_geodesic_graph <- function(knn, return_type = c("matrix", "list"), thread return(res) } -#' Cluster a cell embedding matrix using a graph based algorithm +#' Cluster an adjacency matrix #' @rdname cluster_graph #' @details **cluster_graph_leiden**: Leiden clustering algorithm `igraph::cluster_leiden()`. #' Note that when using `objective_function = "CPM"` the number of clusters empirically scales with `cells * resolution`, #' so 1e-3 is a good resolution for 10k cells, but 1M cells is better with a 1e-5 resolution. A resolution of 1 is a #' good default when `objective_function = "modularity"` per the default. -#' @param mat (matrix) `(cells x n_embeddings)` matrix from a dimensionality reduction. -#' `mat` can also be a knn object (list), with names `idx` and `dist`, returned from a knn method (See `knn_hnsw()`, `knn_annoy()`). -#' Additionally, `mat` can be a symmetric graph adjacency matrix (dgCMatrix) from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. -#' Only the lower triangle from a graph adjacency matrix is used. +#' @param mat Symmetric adjacency matrix (dgCMatrix) output from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Only the lower triangle is used. #' @param resolution Resolution parameter. Higher values result in more clusters #' @param objective_function Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). #' For the meaning of each option, see `igraph::cluster_leiden()`. -#' @param knn_method (function) Function to convert `mat` from cell embeddings to a knn object. -#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. Ignored if `mat` is already a knn object/graph adjacency matrix. 
-#' @param knn_to_graph_method (function) Function to convert `mat` from a knn object to a graph adjacency matrix. -#' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. -#' Ignored if `mat` is already a graph adjacency matrix. #' @param seed Random seed for clustering initialization -#' @param threads (integer) Number of threads to use in `knn_method` and `knn_to_graph_method`. If these functions do not utilize -#' a `threads` argument, this is silently ignored. -#' @param verbose (logical) Whether to print progress information in `knn_method` and `knn_to_graph_method`. If these functions do not utilize -#' a `verbose` argument, this is silently ignored. #' @param ... Additional arguments to underlying clustering function #' @return Factor vector containing the cluster assignment for each cell. #' @export cluster_graph_leiden <- function( mat, resolution = 1, objective_function = c("modularity", "CPM"), - knn_method = knn_hnsw, knn_to_graph_method = knn_to_geodesic_graph, seed = 12531, threads = 0L, verbose = FALSE, ... + seed = 12531, ... 
) { assert_has_package("igraph") # Set seed without permanently changing seed state if (rlang::is_missing(mat)) return(create_partial()) - mat <- convert_mat_to_graph( - mat, knn_method = knn_method, - knn_to_graph_method = knn_to_graph_method, - threads = threads, verbose = verbose - ) prev_seed <- get_seed() on.exit(restore_seed(prev_seed), add = TRUE) set.seed(seed) @@ -309,17 +253,11 @@ cluster_graph_leiden <- function( #' @details **cluster_graph_louvain**: Louvain graph clustering algorithm `igraph::cluster_louvain()` #' @export cluster_graph_louvain <- function( - mat, resolution = 1, knn_method = knn_hnsw, - knn_to_graph_method = knn_to_geodesic_graph, seed = 12531, threads = 0L, verbose = FALSE + mat, resolution = 1, seed = 12531 ) { assert_has_package("igraph") # Set seed without permanently changing seed state if (rlang::is_missing(mat)) return(create_partial()) - mat <- convert_mat_to_graph( - mat, knn_method = knn_method, - knn_to_graph_method = knn_to_graph_method, - threads = threads, verbose = verbose - ) prev_seed <- get_seed() on.exit(restore_seed(prev_seed), add = TRUE) @@ -335,16 +273,10 @@ cluster_graph_louvain <- function( #' @details **cluster_graph_seurat**: Seurat's clustering algorithm `Seurat::FindClusters()` #' @export cluster_graph_seurat <- function( - mat, resolution = 0.8, knn_method = knn_hnsw, - knn_to_graph_method = knn_to_geodesic_graph, threads = 0L, verbose = FALSE, ... + mat, resolution = 0.8, ... ) { assert_has_package("Seurat") if (rlang::is_missing(mat)) return(create_partial()) - mat <- convert_mat_to_graph( - mat, knn_method = knn_method, - knn_to_graph_method = knn_to_graph_method, - threads = threads, verbose = verbose - ) Seurat::as.Graph(mat) %>% Seurat::FindClusters(resolution = resolution, ...) 
%>% .[[1]] diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 78bb35b2..0cdd176f 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -383,9 +383,8 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @param feature_selection_method (function) Method to use for selecting features for LSI. #' Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` #' @param cluster_method (function) Method to use for clustering the post-SVD matrix. -#' Current builtin options are `cluster_graph_{leiden, louvain, seurat}()`. #' The user can pass in partial parameters to the cluster method, such as by passing -#' `cluster_graph_leiden(resolution = 0.5, knn_method = knn_hnsw(ef = 500, k = 12), knn_to_graph_method = knn_to_snn_graph(min_val = 0.1))`. +#' `cluster_cells_graph(mat, graph_to_cluster_method = cluter_graph_louvain(resolution = 0.5))` into `cluster_method`. #' @param threads (integer) Number of threads to use. Also gets passed down into `feature_selection_method` and `cluster_method` #' @return #' `IterativeLSI()` An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: @@ -410,7 +409,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' - If `n_iterations` is 1, return the projected data from the first PCA projection #' - Else, cluster the LSI results using `cluster_method` #' - For each subsequent iteration: -#' - Pseudobulk the clusters and select the top features based on the pseudobulked clusters +#' - Pseudobulk the clusters from the previous iteration and select the top features based on the pseudobulked clusters #' - Perform LSI on the selected features #' - If this is the final iteration, return the projected data from this PCA projection #' - Else, cluster the LSI results using `cluster_method` @@ -433,8 +432,8 @@ project.LSI <- function(x, mat, threads = 1L, ...) 
{ #' While corr_cutoff is provided as an argument in `IterativeLSI()`, it is set to not removing any PCs by default. #' - ArchR filters out outliers dependent on number of accesible regions of cells, by the bottom and top quantiles. This is not implemented in `IterativeLSI()`, #' but can be done as a preprocessing step. -#' @seealso `LSI()` `DimReduction()` `svds()` `knn_hnsw()` `knn_annoy()` -#' `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` `select_features_variance()` `select_features_dispersion()` +#' @seealso `LSI()` `DimReduction()` `svds()` +#' `cluster_cells_graph` `select_features_variance()` `select_features_dispersion()` #' `select_features_mean()` `select_features_binned_dispersion()` #' @inheritParams LSI #' @export @@ -445,7 +444,7 @@ IterativeLSI <- function( scale_factor = 1e4, n_dimensions = 50L, corr_cutoff = 1, - cluster_method = cluster_graph_leiden, + cluster_method = cluster_cells_graph, threads = 1L, verbose = FALSE ) { assert_has_package("RcppHNSW") diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 0635ea68..414830b6 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -12,7 +12,7 @@ IterativeLSI( scale_factor = 10000, n_dimensions = 50L, corr_cutoff = 1, - cluster_method = cluster_graph_leiden, + cluster_method = cluster_cells_graph, threads = 1L, verbose = FALSE ) @@ -35,9 +35,8 @@ Current builtin options are \code{select_features_variance}, \code{select_featur the corr_cutoff, it will be excluded from the final PCA matrix.} \item{cluster_method}{(function) Method to use for clustering the post-SVD matrix. -Current builtin options are \verb{cluster_graph_\{leiden, louvain, seurat\}()}. 
The user can pass in partial parameters to the cluster method, such as by passing -\code{cluster_graph_leiden(resolution = 0.5, knn_method = knn_hnsw(ef = 500, k = 12), knn_to_graph_method = knn_to_snn_graph(min_val = 0.1))}.} +\code{cluster_cells_graph(mat, graph_to_cluster_method = cluter_graph_louvain(resolution = 0.5))} into \code{cluster_method}.} \item{threads}{(integer) Number of threads to use. Also gets passed down into \code{feature_selection_method} and \code{cluster_method}} @@ -86,7 +85,7 @@ The Iterative LSI method is as follows: } \item For each subsequent iteration: \itemize{ -\item Pseudobulk the clusters and select the top features based on the pseudobulked clusters +\item Pseudobulk the clusters from the previous iteration and select the top features based on the pseudobulked clusters \item Perform LSI on the selected features \item If this is the final iteration, return the projected data from this PCA projection \item Else, cluster the LSI results using \code{cluster_method} @@ -115,7 +114,7 @@ but can be done as a preprocessing step. 
} } \seealso{ -\code{LSI()} \code{DimReduction()} \code{svds()} \code{knn_hnsw()} \code{knn_annoy()} -\code{cluster_graph_leiden()} \code{cluster_graph_louvain()} \code{cluster_graph_seurat()} \code{select_features_variance()} \code{select_features_dispersion()} +\code{LSI()} \code{DimReduction()} \code{svds()} +\code{cluster_cells_graph} \code{select_features_variance()} \code{select_features_dispersion()} \code{select_features_mean()} \code{select_features_binned_dispersion()} } diff --git a/r/man/cluster_cells_graph.Rd b/r/man/cluster_cells_graph.Rd index 1ac69d21..6ca56a28 100644 --- a/r/man/cluster_cells_graph.Rd +++ b/r/man/cluster_cells_graph.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/clustering.R \name{cluster_cells_graph} \alias{cluster_cells_graph} -\title{Cluster Embeddings using a kNN-Graph Based Community Algorithm} +\title{Cluster Embeddings using a kNN-Graph Based Algorithm} \usage{ cluster_cells_graph( mat, @@ -16,13 +16,13 @@ cluster_cells_graph( \arguments{ \item{mat}{(matrix) Cell embeddings matrix of shape \verb{(cells x n_embeddings)}} -\item{knn_method}{(function) Function to convert input to a knn object (e.g., \code{knn_hnsw}, \code{knn_annoy}). +\item{knn_method}{(function) Function to convert cell embeddings into a knn object. Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}.} \item{knn_to_graph_method}{(function) Function to convert the knn object returned from \code{knn_method} to a graph adjacency matrix. Must be a (optionally partialized) version of \code{knn_to_graph()}, \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}.} -\item{graph_to_cluster_method}{(function) Community detection algorithm that converts a graph adjacency matrix +\item{graph_to_cluster_method}{(function) Clustering algorithm that converts a graph adjacency matrix returned from \code{graph_to_cluster_method} into cluster labels for each cell. 
Must be a (optionally partialized) version of \code{cluster_graph_leiden()}, \code{cluster_graph_louvain()} or \code{cluster_graph_seurat()}.} @@ -37,7 +37,7 @@ a \code{verbose} argument, this is silently ignored.} } \description{ Take in a cell embedding matrix, and sequentially convert it into a kNN object, then to -a graph adjacency matrix. Following, a community detection algorithm assigns a cluster label +a graph adjacency matrix. Following, a clustering algorithm assigns a label to every cell. } \details{ diff --git a/r/man/cluster_graph.Rd b/r/man/cluster_graph.Rd index a7ef707f..0794ca6c 100644 --- a/r/man/cluster_graph.Rd +++ b/r/man/cluster_graph.Rd @@ -4,73 +4,37 @@ \alias{cluster_graph_leiden} \alias{cluster_graph_louvain} \alias{cluster_graph_seurat} -\title{Cluster a cell embedding matrix using a graph based algorithm} +\title{Cluster an adjacency matrix} \usage{ cluster_graph_leiden( mat, resolution = 1, objective_function = c("modularity", "CPM"), - knn_method = knn_hnsw, - knn_to_graph_method = knn_to_geodesic_graph, seed = 12531, - threads = 0L, - verbose = FALSE, ... ) -cluster_graph_louvain( - mat, - resolution = 1, - knn_method = knn_hnsw, - knn_to_graph_method = knn_to_geodesic_graph, - seed = 12531, - threads = 0L, - verbose = FALSE -) +cluster_graph_louvain(mat, resolution = 1, seed = 12531) -cluster_graph_seurat( - mat, - resolution = 0.8, - knn_method = knn_hnsw, - knn_to_graph_method = knn_to_geodesic_graph, - threads = 0L, - verbose = FALSE, - ... -) +cluster_graph_seurat(mat, resolution = 0.8, ...) } \arguments{ -\item{mat}{(matrix) \verb{(cells x n_embeddings)} matrix from a dimensionality reduction. -\code{mat} can also be a knn object (list), with names \code{idx} and \code{dist}, returned from a knn method (See \code{knn_hnsw()}, \code{knn_annoy()}). -Additionally, \code{mat} can be a symmetric graph adjacency matrix (dgCMatrix) from e.g. \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. 
-Only the lower triangle from a graph adjacency matrix is used.} +\item{mat}{Symmetric adjacency matrix (dgCMatrix) output from e.g. \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Only the lower triangle is used.} \item{resolution}{Resolution parameter. Higher values result in more clusters} \item{objective_function}{Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). For the meaning of each option, see \code{igraph::cluster_leiden()}.} -\item{knn_method}{(function) Function to convert \code{mat} from cell embeddings to a knn object. -Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}. Ignored if \code{mat} is already a knn object/graph adjacency matrix.} - -\item{knn_to_graph_method}{(function) Function to convert \code{mat} from a knn object to a graph adjacency matrix. -Must be a (optionally partialized) version of \code{knn_to_graph()}, \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. -Ignored if \code{mat} is already a graph adjacency matrix.} - \item{seed}{Random seed for clustering initialization} -\item{threads}{(integer) Number of threads to use in \code{knn_method} and \code{knn_to_graph_method}. If these functions do not utilize -a \code{threads} argument, this is silently ignored.} - -\item{verbose}{(logical) Whether to print progress information in \code{knn_method} and \code{knn_to_graph_method}. If these functions do not utilize -a \code{verbose} argument, this is silently ignored.} - \item{...}{Additional arguments to underlying clustering function} } \value{ Factor vector containing the cluster assignment for each cell. } \description{ -Cluster a cell embedding matrix using a graph based algorithm +Cluster an adjacency matrix } \details{ \strong{cluster_graph_leiden}: Leiden clustering algorithm \code{igraph::cluster_leiden()}. 
diff --git a/r/man/convert_mat_to_graph.Rd b/r/man/convert_mat_to_graph.Rd deleted file mode 100644 index da5cfd38..00000000 --- a/r/man/convert_mat_to_graph.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clustering.R -\name{convert_mat_to_graph} -\alias{convert_mat_to_graph} -\title{Converts a matrix to a graph adjacency matrix} -\usage{ -convert_mat_to_graph( - mat, - knn_method = knn_hnsw, - knn_to_graph_method = knn_to_geodesic_graph, - threads = 1L, - verbose = FALSE -) -} -\arguments{ -\item{mat}{Input matrix to be converted of shape \verb{(cells x n_embeddings)}} - -\item{knn_method}{(function) Function to convert input to a knn object (e.g., \code{knn_hnsw}, \code{knn_annoy}). -Ignored if \code{mat} is already a knn object or graph matrix.} - -\item{knn_to_graph_method}{(function) Function to convert a knn object to a graph adjacency matrix -(e.g., \code{knn_to_geodesic_graph}). Ignored if \code{mat} is already a graph matrix.} -} -\value{ -Symmetric graph adjacency matrix (dgCMatrix) -} -\description{ -Handles inputs so that cell embeddings and knn objects are converted into graph adjacency matrices. -} -\details{ -If \code{mat} is a \verb{(cells x n_embeddings)} matrix then \code{knn_method} is used to convert \code{mat} to a knn object. -Knn objects are then converted into a graph adjacency matrix using \code{knn_to_graph_method}. Also does intermediate checks to ensure -that \code{knn_method} and \code{knn_to_graph} properly converted matrix into a correct output. 
-} -\keyword{internal} diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index b21311cc..aa941d3c 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -244,7 +244,7 @@ test_that("Iterative LSI works", { mat <- matrix(data = runif(50000, 0, 1), nrow=500, ncol = 100) %>% as("dgCMatrix") %>% as("IterableMatrix") rownames(mat) <- paste0("feat", seq_len(nrow(mat))) colnames(mat) <- paste0("cell", seq_len(ncol(mat))) - lsi_res_obj <- expect_no_error(IterativeLSI(mat, n_iterations = 2, n_dimensions = 10L, cluster_method = cluster_graph_louvain(knn_method = knn_hnsw))) + lsi_res_obj <- expect_no_error(IterativeLSI(mat, n_iterations = 2, n_dimensions = 10L, cluster_method = cluster_cells_graph(knn_method = knn_annoy))) lsi_res_proj <- project(lsi_res_obj, mat) lsi_res_proj_iter_1 <- expect_no_error(project(lsi_res_obj, mat, iteration = 1L)) lsi_res_embedding <- lsi_res_obj$cell_embeddings @@ -262,8 +262,8 @@ test_that("Iterative LSI works with parameterized clustering", { colnames(mat) <- paste0("cell", seq_len(ncol(mat))) lsi_res_obj <- expect_no_error( IterativeLSI( - mat, n_dimensions = 10L,, - cluster_method = cluster_graph_leiden( + mat, n_dimensions = 10L, + cluster_method = cluster_cells_graph( knn_method = knn_annoy(k = 12), knn_to_graph_method = knn_to_snn_graph(min_val = 0.1) ) From 701f7b4935361ac17387379f0b8f4f3fb28d5600 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 5 Mar 2025 15:10:23 -0800 Subject: [PATCH 134/142] [r] cleanup mat handling during `LSI()` --- r/R/singlecell_utils.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 0cdd176f..b122fe54 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -310,10 +310,12 @@ LSI <- function( threads = threads ) # Save to prevent repeating queued calculations during SVD + temp_mat_dir <- tempfile("mat") mat <- write_matrix_dir( 
convert_matrix_type(mat, type = "float"), - tempfile("mat"), compress = TRUE + temp_mat_dir, compress = TRUE ) + on.exit(unlink(temp_mat_dir, recursive = TRUE, expand = FALSE)) # Run pca if (verbose) log_progress("Calculating SVD") svd_attr <- svds(mat, k = n_dimensions, threads = threads) From 89ce4c055c84692a90c5a19be4c40d8abc5ebd64 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 5 Mar 2025 15:10:38 -0800 Subject: [PATCH 135/142] [r] cleanup `cluster_cells_graph()` title --- r/R/clustering.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/r/R/clustering.R b/r/R/clustering.R index cb5eebf0..797048cc 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -28,11 +28,10 @@ is_adjacency_matrix <- function(mat) { return(is(mat, "dgCMatrix") && nrow(mat) == ncol(mat)) } -#' Cluster Embeddings using a kNN-Graph Based Algorithm +#' Cluster embeddings using a KNN-Graph based algorithm #' #' Take in a cell embedding matrix, and sequentially convert it into a kNN object, then to -#' a graph adjacency matrix. Following, a clustering algorithm assigns a label -#' to every cell. +#' a graph adjacency matrix. Following, assign a label to every cell using a clustering algorithm. #' #' @param mat (matrix) Cell embeddings matrix of shape `(cells x n_embeddings)` #' @param knn_method (function) Function to convert cell embeddings into a knn object. 
From e16890fbf5955e8fe570367081ef55a0f73e785f Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 19 Mar 2025 12:16:00 -0700 Subject: [PATCH 136/142] [r] remove partialized binarization, clean normalization documentation --- r/R/transforms.R | 24 +++++++++++++----------- r/man/binarize.Rd | 9 ++------- r/man/normalize.Rd | 14 +++++++++++--- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index 15d8c388..90667eff 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -294,21 +294,16 @@ setMethod("short_description", "TransformBinarize", function(x) { #' comparison to the threshold is >= (strict_inequality=FALSE) #' or > (strict_inequality=TRUE). #' @return binarized IterableMatrix object -#' @description -#' If the `mat` argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. -#' This can be used to customize `normalize` parameters in other single cell functions in BPCells (e.g. `select_features_mean()`). 
#' @examples #' set.seed(12345) #' mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) #' mat #' mat <- as(mat, "IterableMatrix") +#' +#' #' mat_binarized <- binarize(mat, threshold=1) #' mat_binarized #' as(mat_binarized, "dgCMatrix") -#' -#' # We can also call as a partialized function -#' partial_binarize <- binarize(threshold = 0.1) -#' partial_binarize(mat) #' @export binarize <- function(mat, threshold = 0, strict_inequality = TRUE) { assert_is(threshold, "numeric") @@ -318,7 +313,6 @@ binarize <- function(mat, threshold = 0, strict_inequality = TRUE) { stop("binarize threshold must be greater than or equal to zero when strict_inequality is TRUE") if (!strict_inequality && threshold <= 0) stop("binarize threshold must be greater than zero when strict_inequality is FALSE") - if (rlang::is_missing(mat)) return(create_partial()) assert_is(mat, "IterableMatrix") res <- wrapMatrix("TransformBinarize", @@ -972,13 +966,18 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' mat #' #' mat <- as(mat, "IterableMatrix") +#' +#' +#' ####################################################################### +#' ## normalize_log() example #' normalize_log(mat) #' -#' # normalization functions can also be called with partial arguments +#' ## normalization functions can also be called with partial arguments #' partial_log <- normalize_log(scale_factor = 1e5) #' partial_log #' #' partial_log(mat) +#' ####################################################################### #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_greater_than_zero(scale_factor) @@ -993,12 +992,15 @@ normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { #' @rdname normalize #' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by (`rowMeans(mat)` by default). -#' If feature_means has names and mat has row names, match values by name. 
-#' Otherwise, assume feature_means has the same length and ordering as the matrix rows. +#' If `feature_means` has names and `mat` has row names, match values by name. +#' Otherwise, assume `feature_means` has the same length and ordering as the matrix rows. #' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} #' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 2019, matching the default behavior of `Signac::RunTFIDF()`. This also matches the normalization used within `ArchR::addIterativeLSI()`, but with `binarize = FALSE`. #' @examples +#' ####################################################################### +#' ## normalize_tfidf() example #' normalize_tfidf(mat) +#' ####################################################################### #' @export normalize_tfidf <- function( mat, feature_means = NULL, diff --git a/r/man/binarize.Rd b/r/man/binarize.Rd index fd8f2f2a..acb9fefb 100644 --- a/r/man/binarize.Rd +++ b/r/man/binarize.Rd @@ -27,20 +27,15 @@ are set to one; otherwise, set to zero. When strict_inequality is set to FALSE, element values greater than or equal to the threshold are set to one. As an alternative, the \code{<}, \code{<=}, \code{>}, and \code{>=} operators are also supported. - -If the \code{mat} argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. -This can be used to customize \code{normalize} parameters in other single cell functions in BPCells (e.g. \code{select_features_mean()}). 
} \examples{ set.seed(12345) mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) mat mat <- as(mat, "IterableMatrix") + + mat_binarized <- binarize(mat, threshold=1) mat_binarized as(mat_binarized, "dgCMatrix") - -# We can also call as a partialized function -partial_binarize <- binarize(threshold = 0.1) -partial_binarize(mat) } diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd index d9a78a2c..296e9222 100644 --- a/r/man/normalize.Rd +++ b/r/man/normalize.Rd @@ -17,8 +17,8 @@ normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) \item{threads}{(integer) Number of threads to use.} \item{feature_means}{(numeric, optional) Pre-calculated means of the features to normalize by (\code{rowMeans(mat)} by default). -If feature_means has names and mat has row names, match values by name. -Otherwise, assume feature_means has the same length and ordering as the matrix rows.} +If \code{feature_means} has names and \code{mat} has row names, match values by name. +Otherwise, assume \code{feature_means} has the same length and ordering as the matrix rows.} } \value{ Consider a matrix \eqn{X}, where the row index \eqn{i} refers to each feature @@ -56,12 +56,20 @@ mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) mat mat <- as(mat, "IterableMatrix") + + +####################################################################### +## normalize_log() example normalize_log(mat) -# normalization functions can also be called with partial arguments +## normalization functions can also be called with partial arguments partial_log <- normalize_log(scale_factor = 1e5) partial_log partial_log(mat) +####################################################################### +####################################################################### +## normalize_tfidf() example normalize_tfidf(mat) +####################################################################### } From f5d0129d4fd46299eeb3839915414dd26550c72b Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 19 
Mar 2025 12:17:46 -0700 Subject: [PATCH 137/142] [r] clean up feature selection documentation, add additional dim check on IterativeLSI --- r/R/singlecell_utils.R | 18 +++++++++++++----- r/man/IterativeLSI.Rd | 4 ++-- r/man/cluster_cells_graph.Rd | 5 ++--- r/man/feature_selection.Rd | 4 ++-- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index b122fe54..2f149389 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -19,8 +19,8 @@ #' an output dataframe is returned, indicating which features are highly variable, and the scoring of each feature. #' @rdname feature_selection #' @param mat (IterableMatrix) Counts matrix with dimensions `(features x cells)`. -#' @param num_feats (float) Number of features to mark as highly_variable. If 0 < num_feats < 1, then interpret it as a fraction of features. -#' @param normalize_method (function) Used to normalize the matrix prior to feature selection by calling `normalize_method(mat)` if it is not NULL. +#' @param num_feats (float) Number of features to mark as highly_variable. If 0 < `num_feats` < 1, then interpret as a fraction of features. +#' @param normalize_method (function) Used to normalize the matrix prior to feature selection by calling `normalize_method(mat)` if it is not `NULL`. #' For example, pass `normalize_log()` or `normalize_tfidf()`. #' @param threads (integer) Number of threads to use. Also overrides the threads argument in `normalize_method` #' @returns @@ -386,7 +386,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` #' @param cluster_method (function) Method to use for clustering the post-SVD matrix. 
#' The user can pass in partial parameters to the cluster method, such as by passing -#' `cluster_cells_graph(mat, graph_to_cluster_method = cluter_graph_louvain(resolution = 0.5))` into `cluster_method`. +#' `cluster_cells_graph(mat, graph_to_cluster_method = cluster_graph_louvain(resolution = 0.5))` into `cluster_method`. #' @param threads (integer) Number of threads to use. Also gets passed down into `feature_selection_method` and `cluster_method` #' @return #' `IterativeLSI()` An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: @@ -435,7 +435,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' - ArchR filters out outliers dependent on number of accesible regions of cells, by the bottom and top quantiles. This is not implemented in `IterativeLSI()`, #' but can be done as a preprocessing step. #' @seealso `LSI()` `DimReduction()` `svds()` -#' `cluster_cells_graph` `select_features_variance()` `select_features_dispersion()` +#' `cluster_cells_graph()` `select_features_variance()` `select_features_dispersion()` #' `select_features_mean()` `select_features_binned_dispersion()` #' @inheritParams LSI #' @export @@ -472,7 +472,6 @@ IterativeLSI <- function( feature_selection_method <- partial_apply(feature_selection_method, threads = threads, .missing_args_error = FALSE) if (verbose) log_progress("Starting Iterative LSI") for (i in seq_len(n_iterations)) { - if (verbose) log_progress(sprintf("Starting Iterative LSI iteration %s of %s", i, n_iterations)) # run variable feature selection if (verbose) log_progress("Selecting features") @@ -487,6 +486,15 @@ IterativeLSI <- function( } else { mat_indices <- variable_features } + if (length(mat_indices) < n_dimensions) { + rlang::abort( + sprintf(paste0( + "Number of variable features selected (%s) is smaller than number of dimensions requested (%s). 
\n", + "Try setting the num_feats arg in feature_selection_method as a larger value."), + length(mat_indices), n_dimensions + ) + ) + } # run LSI if (verbose) log_progress("Running LSI") diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 414830b6..e79bc87c 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -36,7 +36,7 @@ the corr_cutoff, it will be excluded from the final PCA matrix.} \item{cluster_method}{(function) Method to use for clustering the post-SVD matrix. The user can pass in partial parameters to the cluster method, such as by passing -\code{cluster_cells_graph(mat, graph_to_cluster_method = cluter_graph_louvain(resolution = 0.5))} into \code{cluster_method}.} +\code{cluster_cells_graph(mat, graph_to_cluster_method = cluster_graph_louvain(resolution = 0.5))} into \code{cluster_method}.} \item{threads}{(integer) Number of threads to use. Also gets passed down into \code{feature_selection_method} and \code{cluster_method}} @@ -115,6 +115,6 @@ but can be done as a preprocessing step. 
} \seealso{ \code{LSI()} \code{DimReduction()} \code{svds()} -\code{cluster_cells_graph} \code{select_features_variance()} \code{select_features_dispersion()} +\code{cluster_cells_graph()} \code{select_features_variance()} \code{select_features_dispersion()} \code{select_features_mean()} \code{select_features_binned_dispersion()} } diff --git a/r/man/cluster_cells_graph.Rd b/r/man/cluster_cells_graph.Rd index 6ca56a28..b99ff5d6 100644 --- a/r/man/cluster_cells_graph.Rd +++ b/r/man/cluster_cells_graph.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/clustering.R \name{cluster_cells_graph} \alias{cluster_cells_graph} -\title{Cluster Embeddings using a kNN-Graph Based Algorithm} +\title{Cluster embeddings using a KNN-Graph based algorithm} \usage{ cluster_cells_graph( mat, @@ -37,8 +37,7 @@ a \code{verbose} argument, this is silently ignored.} } \description{ Take in a cell embedding matrix, and sequentially convert it into a kNN object, then to -a graph adjacency matrix. Following, a clustering algorithm assigns a label -to every cell. +a graph adjacency matrix. Following, assign a label to every cell using a clustering algorithm. } \details{ \code{cluster_cells_graph()} acts as a helper function to wrap input creation and \code{kNN} graph adjacency-based clustering to be done together. The user diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index 62edcc3b..543e6a2a 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -38,9 +38,9 @@ select_features_binned_dispersion( \arguments{ \item{mat}{(IterableMatrix) Counts matrix with dimensions \verb{(features x cells)}.} -\item{num_feats}{(float) Number of features to mark as highly_variable. If 0 < num_feats < 1, then interpret it as a fraction of features.} +\item{num_feats}{(float) Number of features to mark as highly_variable. 
If 0 < \code{num_feats} < 1, then interpret as a fraction of features.} -\item{normalize_method}{(function) Used to normalize the matrix prior to feature selection by calling \code{normalize_method(mat)} if it is not NULL. +\item{normalize_method}{(function) Used to normalize the matrix prior to feature selection by calling \code{normalize_method(mat)} if it is not \code{NULL}. For example, pass \code{normalize_log()} or \code{normalize_tfidf()}.} \item{threads}{(integer) Number of threads to use. Also overrides the threads argument in \code{normalize_method}} From 75017f2461a92c9c38dbf190463e1fa1e034578c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 7 Apr 2025 13:37:36 -0700 Subject: [PATCH 138/142] [r] add initial examples for `DimReduction` subclasses export `cluster_cells_graph()` --- r/NAMESPACE | 1 + r/R/clustering.R | 1 + r/R/singlecell_utils.R | 84 ++++++++++++++++++++++++++++++++++++++ r/R/transforms.R | 2 +- r/man/IterativeLSI.Rd | 32 +++++++++++++++ r/man/LSI.Rd | 17 ++++++++ r/man/feature_selection.Rd | 31 ++++++++++++++ r/man/normalize.Rd | 2 +- 8 files changed, 168 insertions(+), 2 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 9cfed020..a4289b17 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -26,6 +26,7 @@ export(canonical_gene_symbol) export(cellNames) export(checksum) export(chrNames) +export(cluster_cells_graph) export(cluster_graph_leiden) export(cluster_graph_louvain) export(cluster_graph_seurat) diff --git a/r/R/clustering.R b/r/R/clustering.R index 797048cc..f032800f 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -61,6 +61,7 @@ is_adjacency_matrix <- function(mat) { #' they can instead call `cluster_cells_graph()` like so: #' `cluster_cells_graph(mat, graph_to_cluster_method = cluter_graph_louvain(resolution = 0.5))` #' @seealso `knn_hnsw()` `knn_annoy()` `knn_to_graph()` `knn_to_snn_graph()` `knn_to_geodesic_graph()` `cluster_graph_leiden()` `knn_to_snn_graph()` `knn_to_geodesic_graph()` +#' @export 
cluster_cells_graph <- function( mat, knn_method = knn_hnsw, knn_to_graph_method = knn_to_geodesic_graph, diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 2f149389..41192c70 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -34,11 +34,17 @@ #' and the column index \eqn{j} refers to each cell. For each feature \eqn{x_{i} \in X}, we define the following feature-selection scores: #' - `select_features_variance`: \eqn{\mathrm{Score}(x_i) = \frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2} #' @examples +#' +#' ## Prep data #' set.seed(12345) #' mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) #' rownames(mat) <- paste0("gene", seq_len(nrow(mat))) #' mat #' mat <- as(mat, "IterableMatrix") +#' +#' +#' ####################################################################### +#' ## select_features_variance() examples #' select_features_variance( #' mat, #' num_feats=2, @@ -54,6 +60,7 @@ #' ) #' # One can then filter to only variable features using the subset operator: #' mat[variable_features$feature[variable_features$highly_variable],] +#' ####################################################################### #' @seealso `normalize_tfidf()` `normalize_log()` #' @export select_features_variance <- function( @@ -85,6 +92,15 @@ select_features_variance <- function( #' @rdname feature_selection #' @returns #' - `select_features_dispersion`: \eqn{\mathrm{Score}(x_i) = \frac{\frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2}{\bar{x}_i}} +#' @examples +#' ####################################################################### +#' ## select_features_dispersion() example +#' select_features_dispersion( +#' mat, +#' num_feats = 2, +#' normalize_method = normalize_log +#' ) +#' ####################################################################### #' @export select_features_dispersion <- function( mat, num_feats = 0.05, @@ -115,6 +131,15 @@ select_features_dispersion <- function( #' @rdname feature_selection #' 
@returns #' - `select_features_mean`: \eqn{\mathrm{Score}(x_i) = \frac{\sum_{j=1}^{n}\bigl(x_{ij}\bigr)}{n}} +#' @examples +#' ####################################################################### +#' ## select_features_mean() example +#' select_features_mean( +#' mat, +#' num_feats = 2, +#' normalize_method = normalize_log +#' ) +#' ####################################################################### #' @export select_features_mean <- function(mat, num_feats = 0.05, normalize_method = NULL, threads = 1L) { assert_greater_than_zero(num_feats) @@ -150,6 +175,15 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize_method = NULL, #' #' This should be equivalent to `Seurat::FindVariableFeatures()` with `selection.method="mean.var.plot"` #' and `scanpy.pp.highly_variable_genes()` with `flavor="seurat"`. +#' @examples +#' ####################################################################### +#' ## select_features_binned_dispersion() example +#' select_features_binned_dispersion( +#' mat, +#' num_feats = 2 +#' n_bins = 2 +#' ) +#' ####################################################################### #' @export select_features_binned_dispersion <- function( mat, num_feats = 0.05, n_bins = 20, @@ -281,6 +315,19 @@ project.default <- function(x, mat, ...) 
{ #' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: #' - 17.1 MB memory usage, 25.1 seconds runtime #' @seealso `project()` `DimReduction()` `normalize_tfidf()` `normalize_log()` `svds()` +#' @examples +## Prep data +#' nrows <- 50 +#' ncols <- 1000 +#' mat <- matrix(1:(nrows*ncols), nrow = nrows) %>% as("IterableMatrix") +#' rownames(mat) <- paste0("feat", seq(nrows)) +#' colnames(mat) <- paste0("cell", seq(ncols)) +#' +#' +#' ####################################################################### +#' ## LSI() example +#' lsi_result <- LSI(mat, n_dimensions = 10) +#' ####################################################################### #' @export LSI <- function( mat, n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, @@ -347,6 +394,11 @@ LSI <- function( #' @return #' `project()` IterableMatrix of the projected data of shape `(n_dimensions, ncol(mat))`. #' @inheritParams project +#' @examples +#' ####################################################################### +#' ## project() example +#' dim(project(lsi_result, mat)) +#' ####################################################################### #' @export project.LSI <- function(x, mat, threads = 1L, ...) { assert_is_mat(mat) @@ -438,6 +490,33 @@ project.LSI <- function(x, mat, threads = 1L, ...) 
{ #' `cluster_cells_graph()` `select_features_variance()` `select_features_dispersion()` #' `select_features_mean()` `select_features_binned_dispersion()` #' @inheritParams LSI +#' @examples +#' ## Prep data +#' nrows <- 500 +#' ncols <- 10000 +#' mat <- matrix(1:(nrows*ncols), nrow = nrows) %>% as("IterableMatrix") +#' rownames(mat) <- paste0("feat", seq(nrows)) +#' colnames(mat) <- paste0("cell", seq(ncols)) +#' +#' +#' ####################################################################### +#' ## IterativeLSI() examples +#' dim_reduction <- IterativeLSI(mat, n_dimensions = 5) +#' +#' ## Can customize parameters using partialization +#' dim_reduction <- IterativeLSI( +#' mat, +#' n_dimensions = 10, +#' feature_selection_method = select_features_variance( +#' num_feats = 0.5, +#' normalize_method = normalize_tfidf(scale_factor = 5000) +#' ), +#' cluster_method = cluster_cells_graph( +#' graph_to_cluster_method = cluster_graph_louvain(resolution = 0.5), +#' knn_to_graph_method = knn_to_snn_graph +#' ) +#' ) +#' ####################################################################### #' @export IterativeLSI <- function( mat, @@ -539,6 +618,11 @@ IterativeLSI <- function( #' @return #' `project()` Matrix of the projected data of shape `(cells, n_dimensions)`. #' @inheritParams project +#' @examples +#' ####################################################################### +#' ## project() example +#' dim(project(dim_reduction, mat)) +#' ####################################################################### #' @export project.IterativeLSI <- function(x, mat, iteration = x$fitted_params$iterations, threads = 1L, ...) 
{ assert_is_mat(mat) diff --git a/r/R/transforms.R b/r/R/transforms.R index 90667eff..b7f44e6f 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -969,7 +969,7 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' #' #' ####################################################################### -#' ## normalize_log() example +#' ## normalize_log() examples #' normalize_log(mat) #' #' ## normalization functions can also be called with partial arguments diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index e79bc87c..3ba75ef8 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -113,6 +113,38 @@ While corr_cutoff is provided as an argument in \code{IterativeLSI()}, it is set but can be done as a preprocessing step. } } +\examples{ +## Prep data +nrows <- 500 +ncols <- 10000 +mat <- matrix(1:(nrows*ncols), nrow = nrows) \%>\% as("IterableMatrix") +rownames(mat) <- paste0("feat", seq(nrows)) +colnames(mat) <- paste0("cell", seq(ncols)) + + +####################################################################### +## IterativeLSI() examples +dim_reduction <- IterativeLSI(mat, n_dimensions = 5) + +## Can customize parameters using partialization +dim_reduction <- IterativeLSI( + mat, + n_dimensions = 10, + feature_selection_method = select_features_variance( + num_feats = 0.5, + normalize_method = normalize_tfidf(scale_factor = 5000) + ), + cluster_method = cluster_cells_graph( + graph_to_cluster_method = cluster_graph_louvain(resolution = 0.5), + knn_to_graph_method = knn_to_snn_graph + ) +) +####################################################################### +####################################################################### +## project() example +dim(project(dim_reduction, mat)) +####################################################################### +} \seealso{ \code{LSI()} \code{DimReduction()} \code{svds()} \code{cluster_cells_graph()} \code{select_features_variance()} 
\code{select_features_dispersion()} diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index 2210f081..053c1f57 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -59,6 +59,23 @@ Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: \item 17.1 MB memory usage, 25.1 seconds runtime } } +\examples{ +nrows <- 50 +ncols <- 1000 +mat <- matrix(1:(nrows*ncols), nrow = nrows) \%>\% as("IterableMatrix") +rownames(mat) <- paste0("feat", seq(nrows)) +colnames(mat) <- paste0("cell", seq(ncols)) + + +####################################################################### +## LSI() example +lsi_result <- LSI(mat, n_dimensions = 10) +####################################################################### +####################################################################### +## project() example +dim(project(lsi_result, mat)) +####################################################################### +} \seealso{ \code{project()} \code{DimReduction()} \code{normalize_tfidf()} \code{normalize_log()} \code{svds()} } diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index 543e6a2a..9b4d165b 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -94,11 +94,17 @@ This should be equivalent to \code{Seurat::FindVariableFeatures()} with \code{se and \code{scanpy.pp.highly_variable_genes()} with \code{flavor="seurat"}. 
} \examples{ + +## Prep data set.seed(12345) mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) rownames(mat) <- paste0("gene", seq_len(nrow(mat))) mat mat <- as(mat, "IterableMatrix") + + +####################################################################### +## select_features_variance() examples select_features_variance( mat, num_feats=2, @@ -114,6 +120,31 @@ variable_features <- select_features_variance( ) # One can then filter to only variable features using the subset operator: mat[variable_features$feature[variable_features$highly_variable],] +####################################################################### +####################################################################### +## select_features_dispersion() example +select_features_dispersion( + mat, + num_feats = 2, + normalize_method = normalize_log +) +####################################################################### +####################################################################### +## select_features_mean() example +select_features_mean( + mat, + num_feats = 2, + normalize_method = normalize_log +) +####################################################################### +####################################################################### +## select_features_binned_dispersion() example +select_features_binned_dispersion( + mat, + num_feats = 2 + n_bins = 2 +) +####################################################################### } \seealso{ \code{normalize_tfidf()} \code{normalize_log()} diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd index 296e9222..3eac4a60 100644 --- a/r/man/normalize.Rd +++ b/r/man/normalize.Rd @@ -59,7 +59,7 @@ mat <- as(mat, "IterableMatrix") ####################################################################### -## normalize_log() example +## normalize_log() examples normalize_log(mat) ## normalization functions can also be called with partial arguments From 03ed4f96197c5168c91b760bff885b25c5abbe15 Mon Sep 17 00:00:00 2001 From: immanuelazn 
Date: Thu, 10 Apr 2025 14:00:08 -0700 Subject: [PATCH 139/142] [r] add example updates to `LSI()` `IterativeLSI()` `feature_selection` and `normalize` --- r/R/singlecell_utils.R | 42 ++++++++++++++++++++++++++------------ r/R/transforms.R | 8 ++++++-- r/man/IterativeLSI.Rd | 8 ++++++-- r/man/LSI.Rd | 8 ++++++-- r/man/feature_selection.Rd | 26 +++++++++++++++-------- r/man/normalize.Rd | 8 ++++++-- 6 files changed, 70 insertions(+), 30 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 41192c70..e9a5aa9b 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -34,10 +34,9 @@ #' and the column index \eqn{j} refers to each cell. For each feature \eqn{x_{i} \in X}, we define the following feature-selection scores: #' - `select_features_variance`: \eqn{\mathrm{Score}(x_i) = \frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2} #' @examples -#' #' ## Prep data #' set.seed(12345) -#' mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +#' mat <- matrix(rpois(5*8, lambda=1), nrow=5, ncol=8) #' rownames(mat) <- paste0("gene", seq_len(nrow(mat))) #' mat #' mat <- as(mat, "IterableMatrix") @@ -45,6 +44,7 @@ #' #' ####################################################################### #' ## select_features_variance() examples +#' ####################################################################### #' select_features_variance( #' mat, #' num_feats=2, @@ -52,7 +52,8 @@ #' ) #' #' # Because of how the BPCells `normalize` functions behave when the matrix -#' # argument is missing, we can also customize the normalization parameters using partial arguments: +#' # argument is missing, we can also customize the normalization parameters +#' # using partial arguments: #' variable_features <- select_features_variance( #' mat, #' num_feats=2, @@ -60,7 +61,8 @@ #' ) #' # One can then filter to only variable features using the subset operator: #' mat[variable_features$feature[variable_features$highly_variable],] -#' 
####################################################################### +#' +#' #' @seealso `normalize_tfidf()` `normalize_log()` #' @export select_features_variance <- function( @@ -95,12 +97,14 @@ select_features_variance <- function( #' @examples #' ####################################################################### #' ## select_features_dispersion() example +#' ####################################################################### #' select_features_dispersion( #' mat, #' num_feats = 2, #' normalize_method = normalize_log #' ) -#' ####################################################################### +#' +#' #' @export select_features_dispersion <- function( mat, num_feats = 0.05, @@ -134,12 +138,14 @@ select_features_dispersion <- function( #' @examples #' ####################################################################### #' ## select_features_mean() example +#' ####################################################################### #' select_features_mean( #' mat, -#' num_feats = 2, +#' num_feats = 1, #' normalize_method = normalize_log #' ) -#' ####################################################################### +#' +#' #' @export select_features_mean <- function(mat, num_feats = 0.05, normalize_method = NULL, threads = 1L) { assert_greater_than_zero(num_feats) @@ -178,12 +184,14 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize_method = NULL, #' @examples #' ####################################################################### #' ## select_features_binned_dispersion() example +#' ####################################################################### #' select_features_binned_dispersion( #' mat, -#' num_feats = 2 +#' num_feats = 2, #' n_bins = 2 #' ) -#' ####################################################################### +#' +#' #' @export select_features_binned_dispersion <- function( mat, num_feats = 0.05, n_bins = 20, @@ -326,8 +334,10 @@ project.default <- function(x, mat, ...) 
{ #' #' ####################################################################### #' ## LSI() example -#' lsi_result <- LSI(mat, n_dimensions = 10) #' ####################################################################### +#' lsi_result <- LSI(mat, n_dimensions = 10) +#' +#' #' @export LSI <- function( mat, n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, @@ -397,8 +407,10 @@ LSI <- function( #' @examples #' ####################################################################### #' ## project() example -#' dim(project(lsi_result, mat)) #' ####################################################################### +#' dim(project(lsi_result, mat)) +#' +#' #' @export project.LSI <- function(x, mat, threads = 1L, ...) { assert_is_mat(mat) @@ -501,6 +513,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' #' ####################################################################### #' ## IterativeLSI() examples +#' ####################################################################### #' dim_reduction <- IterativeLSI(mat, n_dimensions = 5) #' #' ## Can customize parameters using partialization @@ -516,7 +529,8 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' knn_to_graph_method = knn_to_snn_graph #' ) #' ) -#' ####################################################################### +#' +#' #' @export IterativeLSI <- function( mat, @@ -621,8 +635,10 @@ IterativeLSI <- function( #' @examples #' ####################################################################### #' ## project() example -#' dim(project(dim_reduction, mat)) #' ####################################################################### +#' dim(project(dim_reduction, mat)) +#' +#' #' @export project.IterativeLSI <- function(x, mat, iteration = x$fitted_params$iterations, threads = 1L, ...) 
{ assert_is_mat(mat) diff --git a/r/R/transforms.R b/r/R/transforms.R index b7f44e6f..4a34ff35 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -970,6 +970,7 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' #' ####################################################################### #' ## normalize_log() examples +#' ####################################################################### #' normalize_log(mat) #' #' ## normalization functions can also be called with partial arguments @@ -977,7 +978,8 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' partial_log #' #' partial_log(mat) -#' ####################################################################### +#' +#' #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_greater_than_zero(scale_factor) @@ -999,8 +1001,10 @@ normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { #' @examples #' ####################################################################### #' ## normalize_tfidf() example -#' normalize_tfidf(mat) #' ####################################################################### +#' normalize_tfidf(mat) +#' +#' #' @export normalize_tfidf <- function( mat, feature_means = NULL, diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index 3ba75ef8..c23041a7 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -124,6 +124,7 @@ colnames(mat) <- paste0("cell", seq(ncols)) ####################################################################### ## IterativeLSI() examples +####################################################################### dim_reduction <- IterativeLSI(mat, n_dimensions = 5) ## Can customize parameters using partialization @@ -139,11 +140,14 @@ dim_reduction <- IterativeLSI( knn_to_graph_method = knn_to_snn_graph ) ) -####################################################################### + + 
####################################################################### ## project() example -dim(project(dim_reduction, mat)) ####################################################################### +dim(project(dim_reduction, mat)) + + } \seealso{ \code{LSI()} \code{DimReduction()} \code{svds()} diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index 053c1f57..99837048 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -69,12 +69,16 @@ colnames(mat) <- paste0("cell", seq(ncols)) ####################################################################### ## LSI() example -lsi_result <- LSI(mat, n_dimensions = 10) ####################################################################### +lsi_result <- LSI(mat, n_dimensions = 10) + + ####################################################################### ## project() example -dim(project(lsi_result, mat)) ####################################################################### +dim(project(lsi_result, mat)) + + } \seealso{ \code{project()} \code{DimReduction()} \code{normalize_tfidf()} \code{normalize_log()} \code{svds()} diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index 9b4d165b..bfb9ebb3 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -94,10 +94,9 @@ This should be equivalent to \code{Seurat::FindVariableFeatures()} with \code{se and \code{scanpy.pp.highly_variable_genes()} with \code{flavor="seurat"}. 
} \examples{ - ## Prep data set.seed(12345) -mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +mat <- matrix(rpois(5*8, lambda=1), nrow=5, ncol=8) rownames(mat) <- paste0("gene", seq_len(nrow(mat))) mat mat <- as(mat, "IterableMatrix") @@ -105,6 +104,7 @@ mat <- as(mat, "IterableMatrix") ####################################################################### ## select_features_variance() examples +####################################################################### select_features_variance( mat, num_feats=2, @@ -112,7 +112,8 @@ select_features_variance( ) # Because of how the BPCells `normalize` functions behave when the matrix -# argument is missing, we can also customize the normalization parameters using partial arguments: +# argument is missing, we can also customize the normalization parameters +# using partial arguments: variable_features <- select_features_variance( mat, num_feats=2, @@ -120,31 +121,38 @@ variable_features <- select_features_variance( ) # One can then filter to only variable features using the subset operator: mat[variable_features$feature[variable_features$highly_variable],] -####################################################################### + + ####################################################################### ## select_features_dispersion() example +####################################################################### select_features_dispersion( mat, num_feats = 2, normalize_method = normalize_log ) -####################################################################### + + ####################################################################### ## select_features_mean() example +####################################################################### select_features_mean( mat, - num_feats = 2, + num_feats = 1, normalize_method = normalize_log ) -####################################################################### + + ####################################################################### ## 
select_features_binned_dispersion() example +####################################################################### select_features_binned_dispersion( mat, - num_feats = 2 + num_feats = 2, n_bins = 2 ) -####################################################################### + + } \seealso{ \code{normalize_tfidf()} \code{normalize_log()} diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd index 3eac4a60..c270cb09 100644 --- a/r/man/normalize.Rd +++ b/r/man/normalize.Rd @@ -60,6 +60,7 @@ mat <- as(mat, "IterableMatrix") ####################################################################### ## normalize_log() examples +####################################################################### normalize_log(mat) ## normalization functions can also be called with partial arguments @@ -67,9 +68,12 @@ partial_log <- normalize_log(scale_factor = 1e5) partial_log partial_log(mat) -####################################################################### + + ####################################################################### ## normalize_tfidf() example -normalize_tfidf(mat) ####################################################################### +normalize_tfidf(mat) + + } From 88f9c20c697f7f68cb390d7b6427941644058e16 Mon Sep 17 00:00:00 2001 From: Immanuel Abdi <56730419+immanuelazn@users.noreply.github.com> Date: Tue, 15 Apr 2025 16:18:13 -0700 Subject: [PATCH 140/142] [r] add `print` method for dim reductions (#115) --- r/NAMESPACE | 1 + r/R/singlecell_utils.R | 33 +++++++++++++++++++++++++++++---- r/man/IterativeLSI.Rd | 5 +++-- r/man/LSI.Rd | 5 +++-- 4 files changed, 36 insertions(+), 8 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index a4289b17..96a72269 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -2,6 +2,7 @@ S3method(base::as.data.frame,IterableFragments) S3method(base::as.matrix,IterableMatrix) +S3method(print,DimReduction) S3method(project,IterativeLSI) S3method(project,LSI) S3method(project,default) diff --git a/r/R/singlecell_utils.R 
b/r/R/singlecell_utils.R index e9a5aa9b..346cf5fe 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -292,6 +292,28 @@ project.default <- function(x, mat, ...) { rlang::abort("project method not implemented for objects that are not a fitted DimReduction") } +#' @export +print.DimReduction <- function(x, ...) { + cat(sprintf("Fitted <%s> dimensionality reduction\n\n", class(x)[1])) + + # Print feature info + cat("Number of features:", length(x$feature_names), "\n") + cat("Input feature names:", pretty_print_vector(x$feature_names), "\n") + + # Print embedding info + dim_embeddings <- dim(x$cell_embeddings) + cat(sprintf("cell_embeddings: %d cells x %d embedding dimensions\n", dim_embeddings[1], dim_embeddings[2])) + + # Print param info + # params_str <- paste(names(x$fitted_params), collapse = ", ") + # wrapped_params <- strwrap(params_str, width = getOption("width") - 6) + # cat("Fitted_params:\n") + # for (ln in wrapped_params) { + # cat(" ", ln, "\n", sep = "") + # } + cat("fitted_params: ", stringr::str_wrap(paste(names(x$fitted_params), collapse = ", "), exdent = 2, width = 60), "\n") +} + ################# # LSI Implementation ################# @@ -333,9 +355,10 @@ project.default <- function(x, mat, ...) { #' #' #' ####################################################################### -#' ## LSI() example +#' ## LSI() example #' ####################################################################### #' lsi_result <- LSI(mat, n_dimensions = 10) +#' lsi_result #' #' #' @export @@ -406,7 +429,7 @@ LSI <- function( #' @inheritParams project #' @examples #' ####################################################################### -#' ## project() example +#' ## project() example #' ####################################################################### #' dim(project(lsi_result, mat)) #' @@ -438,6 +461,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { } + #' Run Iterative LSI on a matrix. 
#' #' Given a `(features x cells)` counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. @@ -504,8 +528,8 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @inheritParams LSI #' @examples #' ## Prep data -#' nrows <- 500 -#' ncols <- 10000 +#' nrows <- 350 +#' ncols <- 2000 #' mat <- matrix(1:(nrows*ncols), nrow = nrows) %>% as("IterableMatrix") #' rownames(mat) <- paste0("feat", seq(nrows)) #' colnames(mat) <- paste0("cell", seq(ncols)) @@ -529,6 +553,7 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' knn_to_graph_method = knn_to_snn_graph #' ) #' ) +#' dim_reduction #' #' #' @export diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index c23041a7..d5f36088 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -115,8 +115,8 @@ but can be done as a preprocessing step. } \examples{ ## Prep data -nrows <- 500 -ncols <- 10000 +nrows <- 350 +ncols <- 2000 mat <- matrix(1:(nrows*ncols), nrow = nrows) \%>\% as("IterableMatrix") rownames(mat) <- paste0("feat", seq(nrows)) colnames(mat) <- paste0("cell", seq(ncols)) @@ -140,6 +140,7 @@ dim_reduction <- IterativeLSI( knn_to_graph_method = knn_to_snn_graph ) ) +dim_reduction ####################################################################### diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd index 99837048..bb28f17f 100644 --- a/r/man/LSI.Rd +++ b/r/man/LSI.Rd @@ -68,13 +68,14 @@ colnames(mat) <- paste0("cell", seq(ncols)) ####################################################################### -## LSI() example +## LSI() example ####################################################################### lsi_result <- LSI(mat, n_dimensions = 10) +lsi_result ####################################################################### -## project() example +## project() example ####################################################################### dim(project(lsi_result, mat)) From 0dd291b31baf3d3594d6636de8b3e1d3bb4fc684 Mon Sep 
17 00:00:00 2001 From: immanuelazn Date: Thu, 17 Apr 2025 16:28:02 -0700 Subject: [PATCH 141/142] [r] add missing `feature_names` attr to `IterativeLSI()` documentation --- r/R/singlecell_utils.R | 2 ++ r/man/IterativeLSI.Rd | 2 ++ 2 files changed, 4 insertions(+) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 346cf5fe..790a9fc7 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -479,6 +479,8 @@ project.LSI <- function(x, mat, threads = 1L, ...) { #' @return #' `IterativeLSI()` An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: #' - `cell_embeddings`: The projected data as a matrix of shape `(cells, n_dimensions)` +#' - `feature_names`: The names of features that `IterativeLSI()` was fit on. +#' Note that projection only requires the features used in each specific iteration (as described in `iter_info`) #' - `fitted_params`: A list of the parameters used for iterative LSI. Includes the following: #' - `lsi_method`: The method used for LSI #' - `feature_selection_method`: The method used for selecting features diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd index d5f36088..0071ce71 100644 --- a/r/man/IterativeLSI.Rd +++ b/r/man/IterativeLSI.Rd @@ -48,6 +48,8 @@ The user can pass in partial parameters to the cluster method, such as by passin \code{IterativeLSI()} An object of class \code{c("IterativeLSI", "DimReduction")} with the following attributes: \itemize{ \item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(cells, n_dimensions)} +\item \code{feature_names}: The names of features that \code{IterativeLSI()} was fit on. +Note that projection only requires the features used in each specific iteration (as described in \code{iter_info}) \item \code{fitted_params}: A list of the parameters used for iterative LSI. 
Includes the following: \itemize{ \item \code{lsi_method}: The method used for LSI From 1851026651f2c060641032782c4aa95e237d4ef9 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 21 Apr 2025 03:03:23 -0700 Subject: [PATCH 142/142] [r] remove commented out code in `print.DimReduction` --- r/R/singlecell_utils.R | 6 ------ 1 file changed, 6 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 790a9fc7..11e5269e 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -305,12 +305,6 @@ print.DimReduction <- function(x, ...) { cat(sprintf("cell_embeddings: %d cells x %d embedding dimensions\n", dim_embeddings[1], dim_embeddings[2])) # Print param info - # params_str <- paste(names(x$fitted_params), collapse = ", ") - # wrapped_params <- strwrap(params_str, width = getOption("width") - 6) - # cat("Fitted_params:\n") - # for (ln in wrapped_params) { - # cat(" ", ln, "\n", sep = "") - # } cat("fitted_params: ", stringr::str_wrap(paste(names(x$fitted_params), collapse = ", "), exdent = 2, width = 60), "\n") }