diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 17faadb9..1f5a3842 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -48,7 +48,9 @@ Suggests: IRanges, GenomicRanges, matrixStats, - igraph + igraph, + RcppHNSW, + RcppAnnoy Depends: R (>= 3.5.0) Config/Needs/website: pkgdown, devtools, uwot, irlba, RcppHNSW, igraph, BiocManager, bioc::BSgenome.Hsapiens.UCSC.hg38, github::GreenleafLab/motifmatchr, github::GreenleafLab/chromVARmotifs diff --git a/r/NAMESPACE b/r/NAMESPACE index 87862f4b..96a72269 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -2,11 +2,18 @@ S3method(base::as.data.frame,IterableFragments) S3method(base::as.matrix,IterableMatrix) +S3method(print,DimReduction) +S3method(project,IterativeLSI) +S3method(project,LSI) +S3method(project,default) S3method(svds,IterableMatrix) S3method(svds,default) export("all_matrix_inputs<-") export("cellNames<-") export("chrNames<-") +export(DimReduction) +export(IterativeLSI) +export(LSI) export(add_cols) export(add_rows) export(all_matrix_inputs) @@ -20,6 +27,7 @@ export(canonical_gene_symbol) export(cellNames) export(checksum) export(chrNames) +export(cluster_cells_graph) export(cluster_graph_leiden) export(cluster_graph_louvain) export(cluster_graph_seurat) @@ -65,6 +73,8 @@ export(min_by_row) export(min_scalar) export(multiply_cols) export(multiply_rows) +export(normalize_log) +export(normalize_tfidf) export(nucleosome_counts) export(open_fragments_10x) export(open_fragments_dir) @@ -83,6 +93,7 @@ export(plot_tf_footprint) export(plot_tss_profile) export(plot_tss_scatter) export(prefix_cell_names) +export(project) export(pseudobulk_matrix) export(qc_scATAC) export(range_distance_to_nearest) @@ -108,6 +119,10 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) +export(select_features_binned_dispersion) +export(select_features_dispersion) +export(select_features_mean) +export(select_features_variance) export(select_regions) export(set_trackplot_height) 
export(set_trackplot_label) diff --git a/r/NEWS.md b/r/NEWS.md index e75ce4a0..c695e83b 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -8,8 +8,17 @@ Contributions welcome :) # BPCells 0.3.1 (in-progress main branch) +## Breaking changes +- Change first parameter name of `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()` from `snn` to `mat` to more accurately reflect the input type. (pull request #189) + ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166) +- Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168) +- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p (pull request #189) +- Add feature selection functions `select_features_variance()`, and `select_features_{dispersion,mean,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189) +- Add `LSI()` and `IterativeLSI()` dimensionality reduction functions to perform latent semantic indexing on a matrix (pull request #189). +- Add capability to create partial function objects when excluding the first argument of a function. This is implemented in normalizations, feature selections, dimensionality reductions, and clustering functions. See `select_features_variance()` for usage. (pull request #189) +- Create a wrapper function `cluster_cells_graph()` that wraps the steps of knn object creation, graph adjacency creation, and clustering all within a single function (pull request #189) ## Improvements - Speed up taking large subsets of large concatenated matrices, e.g. selecting 9M cells from a 10M cell matrix composed of ~100 concatenated pieces. 
(pull request #179) diff --git a/r/R/clustering.R b/r/R/clustering.R index b53a4edf..f032800f 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -6,6 +6,84 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. + +#' Check if an input is a kNN object +#' +#' knn object functions `knn_hnsw()` and `knn_annoy()` return a list of two matrices, `idx` and `dist`. +#' These are used as inputs to create graph adjacency matrices for clustering. +#' Assume any list with at least both `idx` and `dist` items is a kNN object. +#' @return TRUE if the mat is a knn object, FALSE otherwise +#' @keywords internal +is_knn_object <- function(mat) { + return(is(mat, "list") && all(c("idx", "dist") %in% names(mat))) +} + +#' Check if an input is a graph adjacency matrix. +#' +#' Clustering functions like `cluster_graph_leiden()` and `cluster_graph_louvain()` require a graph adjacency matrix as input. +#' We assume that any square `dgCMatrix` is a graph adjacency matrix. +#' @return TRUE if the mat is a graph adjacency matrix, FALSE otherwise +#' @keywords internal +is_adjacency_matrix <- function(mat) { + return(is(mat, "dgCMatrix") && nrow(mat) == ncol(mat)) +} + +#' Cluster embeddings using a KNN-Graph based algorithm +#' +#' Take in a cell embedding matrix, and sequentially convert it into a kNN object, then to +#' a graph adjacency matrix. Following, assign a label to every cell using a clustering algorithm. +#' +#' @param mat (matrix) Cell embeddings matrix of shape `(cells x n_embeddings)` +#' @param knn_method (function) Function to convert cell embeddings into a knn object. +#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. +#' @param knn_to_graph_method (function) Function to convert the knn object returned from `knn_method` to a graph adjacency matrix. +#' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. 
+#' @param graph_to_cluster_method (function) Clustering algorithm that converts a graph adjacency matrix +#' returned from `knn_to_graph_method` into cluster labels for each cell. +#' Must be a (optionally partialized) version of `cluster_graph_leiden()`, `cluster_graph_louvain()` or `cluster_graph_seurat()`. +#' @param threads (integer) Number of threads to use in `knn_method`, `knn_to_graph_method` and `graph_to_cluster_method`. If these functions do not utilize +#' a `threads` argument, this is silently ignored. +#' @param verbose (logical) Whether to print progress information in `knn_method`, `knn_to_graph_method` and `graph_to_cluster_method`. If these functions do not utilize +#' a `verbose` argument, this is silently ignored. +#' @returns (factor) Factor vector containing the cluster assignment for each cell. +#' @details +#' `cluster_cells_graph()` acts as a helper function to wrap input creation and `kNN` graph adjacency-based clustering to be done together. The user +#' can also manually pass cell embeddings to their preferred knn/clustering functions of choice. +#' +#' **Clustering customization through partialized parameters** +#' +#' Customization of clustering is possible through partialization of each parameter in `cluster_cells_graph()` that is a function. +#' In detail, each parameter that requests a function +#' may take in one with only some of the arguments provided. If the first argument is not provided, a copy of a function is utilized that has its parameters +#' changed with the arguments provided. 
+#' +#' For instance, if the user desires for `cluster_cells_graph()` to instead use `cluster_graph_louvain()` with resolution different than the default, +#' they can instead call `cluster_cells_graph()` like so: +#' `cluster_cells_graph(mat, graph_to_cluster_method = cluster_graph_louvain(resolution = 0.5))` +#' @seealso `knn_hnsw()` `knn_annoy()` `knn_to_graph()` `knn_to_snn_graph()` `knn_to_geodesic_graph()` `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` +#' @export +cluster_cells_graph <- function( + mat, knn_method = knn_hnsw, + knn_to_graph_method = knn_to_geodesic_graph, + graph_to_cluster_method = cluster_graph_leiden, + threads = 0L, verbose = FALSE +) { + assert_is_wholenumber(threads) + assert_is(verbose, "logical") + if (rlang::is_missing(mat)) return(create_partial()) + assert_is(mat, "matrix") + # There currently aren't any `knn_to_graph` functions that utilize a verbose argument. + # However, we still pass `verbose` in case future functions do provide this functionality. + mat <- partial_apply(knn_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) + if (!is_knn_object(mat)) pretty_error(mat, "`knn_method` was unable to convert `mat` into a knn object", 1) + # Return type has to be constrained to "matrix", so this is silently provided. + mat <- partial_apply(knn_to_graph_method, threads = threads, verbose = verbose, return_type = "matrix", .missing_args_error = FALSE)(mat) + if (!is_adjacency_matrix(mat)) pretty_error(mat, "`knn_to_graph_method` was unable to convert `mat` from a knn object to a graph adjacency matrix", 1) + # Also pass verbose and threads to clustering functions in case they are given these params in the future + mat <- partial_apply(graph_to_cluster_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) + return(mat) +} + #' K Nearest Neighbor (KNN) Graph #' #' Convert a KNN object (e.g. 
returned by `knn_hnsw()` or `knn_annoy()`) into @@ -22,6 +100,11 @@ #' Sparse matrix (dgCMatrix) where `mat[i,j]` = distance from cell `i` to #' cell `j`, or 0 if cell `j` is not in the K nearest neighbors of `i` knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) { + assert_is(use_weights, "logical") + assert_is(self_loops, "logical") + + if (rlang::is_missing(knn)) return(create_partial()) + assert_true(is_knn_object(knn)) if (use_weights) { weights <- knn$dist } else { @@ -41,7 +124,6 @@ knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) { mat } - #' @rdname knn_graph #' @details **knn_to_snn_graph** #' Convert a knn object into a shared nearest neighbors adjacency matrix. @@ -63,6 +145,9 @@ knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) { #' @export knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_type=c("matrix", "list")) { return_type <- match.arg(return_type) + assert_is(self_loops, "logical") + if (rlang::is_missing(knn)) return(create_partial()) + assert_true(is_knn_object(knn)) # Solve x / (2*K - x) >= min_val --> x >= 2*K*min_val / (1 + min_val) min_int <- ceiling(2*min_val*ncol(knn$idx) / (1 + min_val)) snn <- build_snn_graph_cpp(knn$idx, min_neighbors = min_int) @@ -83,13 +168,15 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t } # Return as a sparse matrix - Matrix::sparseMatrix( + res <- Matrix::sparseMatrix( i = snn$i + 1L, j = snn$j + 1L, x = snn$weight, dims = c(snn$dim, snn$dim) ) + return(res) } #' @rdname knn_graph +#' @param threads Number of threads to use during calculations #' @details **knn_to_geodesic_graph** #' Convert a knn object into an undirected weighted graph, using the same #' geodesic distance estimation method as the UMAP package. 
@@ -101,7 +188,6 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t #' neighbor, results may differ slightly from `umap._umap.fuzzy_simplicial_set`, which #' assumes self is always successfully found in the approximate nearest neighbor search. #' -#' @param threads Number of threads to use during calculations #' @return **knn_to_geodesic_graph** #' - `return_type == "matrix"`: #' Sparse matrix (dgCMatrix) where `mat[i,j]` = normalized similarity between cell `i` and cell `j`. @@ -111,29 +197,31 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t #' These correspond to the rows, cols, and values of non-zero entries in the lower triangle #' adjacency matrix. `dim` is the total number of vertices (cells) in the graph #' @export -knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads=0L) { +knn_to_geodesic_graph <- function(knn, return_type = c("matrix", "list"), threads = 0L) { return_type <- match.arg(return_type) + assert_is_wholenumber(threads) + if (rlang::is_missing(knn)) return(create_partial()) + assert_true(is_knn_object(knn)) graph <- build_umap_graph_cpp(knn$dist, knn$idx, threads=threads) graph$dim <- nrow(knn$idx) - if (return_type == "list") { - return(graph) - } + if (return_type == "list") return(graph) # Return as a sparse matrix - Matrix::sparseMatrix( + res <- Matrix::sparseMatrix( i = graph$i + 1L, j = graph$j + 1L, x = graph$weight, dims = c(graph$dim, graph$dim) ) + return(res) } #' Cluster an adjacency matrix -#' @rdname cluster +#' @rdname cluster_graph #' @details **cluster_graph_leiden**: Leiden clustering algorithm `igraph::cluster_leiden()`. #' Note that when using `objective_function = "CPM"` the number of clusters empirically scales with `cells * resolution`, #' so 1e-3 is a good resolution for 10k cells, but 1M cells is better with a 1e-5 resolution. A resolution of 1 is a #' good default when `objective_function = "modularity"` per the default. 
-#' @param snn Symmetric adjacency matrix (dgCMatrix) output from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Only the lower triangle is used +#' @param mat Symmetric adjacency matrix (dgCMatrix) output from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Only the lower triangle is used. #' @param resolution Resolution parameter. Higher values result in more clusters #' @param objective_function Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). #' For the meaning of each option, see `igraph::cluster_leiden()`. @@ -141,44 +229,55 @@ knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads= #' @param ... Additional arguments to underlying clustering function #' @return Factor vector containing the cluster assignment for each cell. #' @export -cluster_graph_leiden <- function(snn, resolution = 1, objective_function = c("modularity", "CPM"), seed = 12531, ...) { +cluster_graph_leiden <- function( + mat, resolution = 1, objective_function = c("modularity", "CPM"), + seed = 12531, ... +) { assert_has_package("igraph") # Set seed without permanently changing seed state + if (rlang::is_missing(mat)) return(create_partial()) prev_seed <- get_seed() on.exit(restore_seed(prev_seed), add = TRUE) set.seed(seed) objective_function <- match.arg(objective_function) - igraph::graph_from_adjacency_matrix(snn, weighted = TRUE, diag = FALSE, mode = "lower") %>% + igraph::graph_from_adjacency_matrix(mat, weighted = TRUE, diag = FALSE, mode = "lower") %>% igraph::cluster_leiden(resolution_parameter = resolution, objective_function=objective_function, ...) 
%>% igraph::membership() %>% as.factor() } -#' @rdname cluster +#' @rdname cluster_graph #' @details **cluster_graph_louvain**: Louvain graph clustering algorithm `igraph::cluster_louvain()` #' @export -cluster_graph_louvain <- function(snn, resolution = 1, seed = 12531) { +cluster_graph_louvain <- function( + mat, resolution = 1, seed = 12531 +) { assert_has_package("igraph") # Set seed without permanently changing seed state + if (rlang::is_missing(mat)) return(create_partial()) + prev_seed <- get_seed() on.exit(restore_seed(prev_seed), add = TRUE) set.seed(seed) - igraph::graph_from_adjacency_matrix(snn, weighted = TRUE, diag = FALSE, mode = "lower") %>% + igraph::graph_from_adjacency_matrix(mat, weighted = TRUE, diag = FALSE, mode = "lower") %>% igraph::cluster_louvain(resolution = resolution) %>% igraph::membership() %>% as.factor() } -#' @rdname cluster +#' @rdname cluster_graph #' @details **cluster_graph_seurat**: Seurat's clustering algorithm `Seurat::FindClusters()` #' @export -cluster_graph_seurat <- function(snn, resolution = 0.8, ...) { +cluster_graph_seurat <- function( + mat, resolution = 0.8, ... +) { assert_has_package("Seurat") - Seurat::as.Graph(snn) %>% + if (rlang::is_missing(mat)) return(create_partial()) + Seurat::as.Graph(mat) %>% Seurat::FindClusters(resolution = resolution, ...) %>% .[[1]] } @@ -213,7 +312,7 @@ cluster_membership_matrix <- function(groups, group_order = NULL) { } -#' Get a knn matrix from reduced dimensions +#' Get a knn object from reduced dimensions #' #' Search for approximate nearest neighbors between cells in the reduced #' dimensions (e.g. PCA), and return the k nearest neighbors (knn) for each @@ -228,7 +327,7 @@ cluster_membership_matrix <- function(groups, group_order = NULL) { #' @param metric distance metric to use #' @param threads Number of threads to use. Note that result is non-deterministic #' if threads > 1 -#' @param ef ef parameter for RccppHNSW::hnsw_search. 
Increase for slower search but +#' @param ef ef parameter for `RcppHNSW::hnsw_search()`. Increase for slower search but #' improved accuracy #' @param verbose whether to print progress information during search #' @return List of 2 matrices -- idx for cell x K neighbor indices, @@ -238,6 +337,10 @@ cluster_membership_matrix <- function(groups, group_order = NULL) { #' @export knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine"), verbose = TRUE, threads = 1, ef = 100) { metric <- match.arg(metric) + assert_is(verbose, "logical") + assert_is_wholenumber(threads) + assert_has_package("RcppHNSW") + if (rlang::is_missing(data)) return(create_partial()) index <- RcppHNSW::hnsw_build( data, distance = metric, @@ -271,8 +374,11 @@ knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine #' @param n_trees Number of trees during index build time. More trees gives higher accuracy #' @param search_k Number of nodes to inspect during the query, or -1 for default value. 
Higher number gives higher accuracy #' @export -knn_annoy <- function(data, query = data, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, search_k = -1) { +knn_annoy <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, search_k = -1) { metric <- match.arg(metric) + assert_has_package("RcppAnnoy") + if (rlang::is_missing(data)) return(create_partial()) + if (is.null(query)) query <- data annoy <- switch(metric, "euclidean" = new(RcppAnnoy::AnnoyEuclidean, ncol(data)), "cosine" = new(RcppAnnoy::AnnoyAngular, ncol(data)), @@ -294,5 +400,6 @@ knn_annoy <- function(data, query = data, k = 10, metric = c("euclidean", "cosin dist[i, ] <- res$dist } if (metric == "cosine") dist <- 0.5 * (dist * dist) - list(idx = idx, dist = dist) + res <- list(idx = idx, dist = dist) + return(res) } diff --git a/r/R/errorChecking.R b/r/R/errorChecking.R index 7691ab98..ed24087d 100644 --- a/r/R/errorChecking.R +++ b/r/R/errorChecking.R @@ -113,7 +113,12 @@ assert_is <- function(object, class, n = 1) { if (!match) pretty_error(object, sprintf("must have class %s", paste0(class, collapse = ", or ")), n) } } - +assert_is_mat <- function(object, n = 1) { + # Accept any object that can be coerced to an IterableMatrix (e.g. base matrix, dgCMatrix) + if (!canCoerce(object, "IterableMatrix")) { + pretty_error(object, "must either be an IterableMatrix or coercible to an IterableMatrix", n) + } +} assert_true <- function(expr, n = 1) { if (!expr) pretty_error(expr, "is not true", n) } diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index c77a9f26..11e5269e 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -1,4 +1,4 @@ -# Copyright 2023 BPCells contributors +# Copyright 2025 BPCells contributors # # Licensed under the Apache License, Version 2.0 or the MIT license @@ -6,6 +6,700 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. 
+ +################# +# Feature selection +################# + +#' Feature selection functions +#' +#' Apply a feature selection method to a non-normalized `(features x cells)` matrix. We recommend using counts matrices as input and to +#' apply any normalizations prior to feature selection via the normalize argument (if available). +#' Instead of directly subsetting the input matrix, +#' an output dataframe is returned, indicating which features are highly variable, and the scoring of each feature. +#' @rdname feature_selection +#' @param mat (IterableMatrix) Counts matrix with dimensions `(features x cells)`. +#' @param num_feats (float) Number of features to mark as highly_variable. If 0 < `num_feats` < 1, then interpret as a fraction of features. +#' @param normalize_method (function) Used to normalize the matrix prior to feature selection by calling `normalize_method(mat)` if it is not `NULL`. +#' For example, pass `normalize_log()` or `normalize_tfidf()`. +#' @param threads (integer) Number of threads to use. Also overrides the threads argument in `normalize_method` +#' @returns +#' Return a dataframe with the following columns: +#' - `feature`: Feature name. +#' - `score`: Scoring of the feature, depending on the method used. +#' - `highly_variable`: Logical vector of whether the feature is highly variable. +#' +#' Each different feature selection method will have a different scoring method. +#' Consider a matrix \eqn{X}, where the row index \eqn{i} refers to each feature +#' and the column index \eqn{j} refers to each cell. 
For each feature \eqn{x_{i} \in X}, we define the following feature-selection scores: +#' - `select_features_variance`: \eqn{\mathrm{Score}(x_i) = \frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2} +#' @examples +#' ## Prep data +#' set.seed(12345) +#' mat <- matrix(rpois(5*8, lambda=1), nrow=5, ncol=8) +#' rownames(mat) <- paste0("gene", seq_len(nrow(mat))) +#' mat +#' mat <- as(mat, "IterableMatrix") +#' +#' +#' ####################################################################### +#' ## select_features_variance() examples +#' ####################################################################### +#' select_features_variance( +#' mat, +#' num_feats=2, +#' normalize_method=normalize_log +#' ) +#' +#' # Because of how the BPCells `normalize` functions behave when the matrix +#' # argument is missing, we can also customize the normalization parameters +#' # using partial arguments: +#' variable_features <- select_features_variance( +#' mat, +#' num_feats=2, +#' normalize_method=normalize_log(scale_factor=20) +#' ) +#' # One can then filter to only variable features using the subset operator: +#' mat[variable_features$feature[variable_features$highly_variable],] +#' +#' +#' @seealso `normalize_tfidf()` `normalize_log()` +#' @export +select_features_variance <- function( + mat, num_feats = 0.05, + normalize_method = NULL, + threads = 1L +) { + assert_greater_than_zero(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(threads) + if (rlang::is_missing(mat)) return(create_partial()) + assert_is_mat(mat) + + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) + if (min(max(num_feats, 0), nrow(mat)) != num_feats) { + rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) + } + num_feats <- min(max(num_feats, 0), nrow(mat)) + if (!is.null(normalize_method)) mat <- partial_apply(normalize_method, threads = threads, 
.missing_args_error = FALSE)(mat) + features_df <- tibble::tibble( + feature = rownames(mat), + score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",] + ) %>% + dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) + return(features_df) +} + + +#' @rdname feature_selection +#' @returns +#' - `select_features_dispersion`: \eqn{\mathrm{Score}(x_i) = \frac{\frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2}{\bar{x}_i}} +#' @examples +#' ####################################################################### +#' ## select_features_dispersion() example +#' ####################################################################### +#' select_features_dispersion( +#' mat, +#' num_feats = 2, +#' normalize_method = normalize_log +#' ) +#' +#' +#' @export +select_features_dispersion <- function( + mat, num_feats = 0.05, + normalize_method = NULL, + threads = 1L +) { + assert_greater_than_zero(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(threads) + if (rlang::is_missing(mat)) return(create_partial()) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) + if (min(max(num_feats, 0), nrow(mat)) != num_feats) { + rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) + } + num_feats <- min(max(num_feats, 0), nrow(mat)) + if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") + assert_is(mat, "IterableMatrix") + if (!is.null(normalize_method)) mat <- partial_apply(normalize_method, threads = threads, .missing_args_error = FALSE)(mat) + mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) + features_df <- tibble::tibble( + feature = rownames(mat), + score = mat_stats$row_stats["variance", ] / mat_stats$row_stats["mean", ] + ) %>% + dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) 
<= num_feats) + return(features_df) +} + +#' @rdname feature_selection +#' @returns +#' - `select_features_mean`: \eqn{\mathrm{Score}(x_i) = \frac{\sum_{j=1}^{n}\bigl(x_{ij}\bigr)}{n}} +#' @examples +#' ####################################################################### +#' ## select_features_mean() example +#' ####################################################################### +#' select_features_mean( +#' mat, +#' num_feats = 1, +#' normalize_method = normalize_log +#' ) +#' +#' +#' @export +select_features_mean <- function(mat, num_feats = 0.05, normalize_method = NULL, threads = 1L) { + assert_greater_than_zero(num_feats) + assert_is_wholenumber(threads) + if (rlang::is_missing(mat)) return(create_partial()) + assert_is_mat(mat) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) + if (min(max(num_feats, 0), nrow(mat)) != num_feats) { + rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) + } + num_feats <- min(max(num_feats, 0), nrow(mat)) + if (!is.null(normalize_method)) mat <- partial_apply(normalize_method, threads = threads, .missing_args_error = FALSE)(mat) + # get the sum of each feature, binarized + features_df <- tibble::tibble( + feature = rownames(mat), + score = matrix_stats(mat, row_stats = "mean", threads = threads)$row_stats["mean", ] + ) %>% + dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) + return(features_df) +} + +#' @rdname feature_selection +#' @param n_bins (integer) Number of bins to split features into in order to control for the relationship between mean expression and dispersion (see details). +#' @returns +#' - `select_features_binned_dispersion`: Process described in `details`. +#' @details +#' `select_features_binned_dispersion` implements the approach from Satija et al. 2015: +#' 1. Bin features into equal-width bins by `log1p(mean)` +#' 2. 
Calculate dispersion of each feature as `log(variance / mean)` +#' 3. Z-score normalize dispersion within each bin, and select highest normalized dispersion across all bins +#' +#' If the number of features within a bin is equal to 1, then the mean dispersion for that bin is set to 1. +#' +#' This should be equivalent to `Seurat::FindVariableFeatures()` with `selection.method="mean.var.plot"` +#' and `scanpy.pp.highly_variable_genes()` with `flavor="seurat"`. +#' @examples +#' ####################################################################### +#' ## select_features_binned_dispersion() example +#' ####################################################################### +#' select_features_binned_dispersion( +#' mat, +#' num_feats = 2, +#' n_bins = 2 +#' ) +#' +#' +#' @export +select_features_binned_dispersion <- function( + mat, num_feats = 0.05, n_bins = 20, + threads = 1L +) { + assert_greater_than_zero(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(n_bins) + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + assert_is_wholenumber(threads) + if (rlang::is_missing(mat)) return(create_partial()) + assert_is_mat(mat) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) + if (min(max(num_feats, 0), nrow(mat)) != num_feats) { + rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) + } + num_feats <- min(max(num_feats, 0), nrow(mat)) + # Calculate row information for dispersion + mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + feature_vars <- mat_stats$row_stats["variance", ] + # Calculate dispersion, and log normalize + feature_dispersion <- feature_vars / feature_means + feature_dispersion[feature_dispersion == 0] <- NA + feature_dispersion <- log(feature_dispersion) + feature_dispersion[feature_means == 0] <- 0 + feature_means <- 
log1p(feature_means) + features_df <- tibble::tibble( + feature = names(feature_means), + var = feature_vars, + mean = feature_means, + dispersion = feature_dispersion + ) + # Bin by mean, and normalize dispersion with each bin + features_df <- features_df %>% + dplyr::mutate(bin = cut(mean, n_bins, labels = FALSE)) %>% + dplyr::group_by(bin) %>% + dplyr::mutate( + score = (dispersion - mean(dispersion)) / sd(dispersion), + score = if (dplyr::n() == 1) {1} else {score} # Set feats that are in bins with only one feat to have a norm dispersion of 1 + ) %>% + dplyr::ungroup() %>% + dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) %>% + dplyr::select(c("feature", "score", "highly_variable", "dispersion", "bin")) %>% + dplyr::rename("raw_log_dispersion" = "dispersion") + return(features_df) +} + + +################# +# DimReduction Class Definition +################# + +#' Barebones definition of a DimReduction class. +#' +#' Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. +#' Child classes should implement a `project` method to allow for the projection of other matrices using +#' the fitted transformation object. +#' @rdname DimReduction +#' @field cell_embeddings (IterableMatrix, dgCMatrix, matrix) Projected data of shape `(cells x n_dimensions)` of the original matrix after a dimensionality reduction. +#' @field fitted_params (list) A list of parameters used for the transformation of a matrix. This should include all necessary information to project new data with the same features. +#' @field feature_names (character) The names of the features that this DimReduction object was fit on. Matrices to be projected should have the same feature names. +#' @return - `DimReduction()`: DimReduction object. +#' @export +DimReduction <- function(cell_embeddings, fitted_params = list(), feature_names = character(0), ...) 
{ + assert_is(cell_embeddings, c("IterableMatrix", "dgCMatrix", "matrix")) + assert_is(fitted_params, "list") + structure(list( + cell_embeddings = cell_embeddings, + fitted_params = fitted_params, + feature_names = feature_names, + ... + ), + class = "DimReduction" + ) +} + +#' @rdname DimReduction +#' @param mat (IterableMatrix) Input matrix of shape `(features x cells)`. +#' @param x DimReduction object. +#' @return - `project()`: IterableMatrix object of the projected data. +#' @details +#' `project()`: Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. +#' +#' DimReduction subclasses should use the `project` method on new data with the same features, to project into the same latent space. +#' All required information to run a projection should be held in `x$fitted_params`, including pertinent parameters when constructing the DimReduction subclass object. +#' +#' If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features. +#' If there are rownames, reorder the matrix to match the order of the original matrix. +#' @inheritParams DimReduction +#' @export +project <- function(x, mat, ...) { + UseMethod("project") +} +#' @export +project.default <- function(x, mat, ...) { + rlang::abort("project method not implemented for objects that are not a fitted DimReduction") +} + +#' @export +print.DimReduction <- function(x, ...) 
{ + cat(sprintf("Fitted <%s> dimensionality reduction\n\n", class(x)[1])) + + # Print feature info + cat("Number of features:", length(x$feature_names), "\n") + cat("Input feature names:", pretty_print_vector(x$feature_names), "\n") + + # Print embedding info + dim_embeddings <- dim(x$cell_embeddings) + cat(sprintf("cell_embeddings: %d cells x %d embedding dimensions\n", dim_embeddings[1], dim_embeddings[2])) + + # Print param info + cat("fitted_params: ", stringr::str_wrap(paste(names(x$fitted_params), collapse = ", "), exdent = 2, width = 60), "\n") +} + +################# +# LSI Implementation +################# + + +#' Perform latent semantic indexing (LSI) on a matrix. +#' +#' Given a `(features x cells)` counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation +#' of the matrix of shape `(n_dimensions, ncol(mat))`. +#' Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. +#' @rdname LSI +#' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. +#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is greater or equal to +#' the corr_cutoff, it will be excluded from the final PCA matrix. +#' @param scale_factor (numeric) Scaling factor to multiply matrix by during tf-idf normalization. +#' @param threads (integer) Number of threads to use. 
+#' @returns +#' `LSI()` An object of class `c("LSI", "DimReduction")` with the following attributes: +#' - `cell_embeddings`: The projected data as a matrix of shape `(cells, n_dimensions)` +#' - `fitted_params`: A named list of the parameters used for LSI, including the following: +#' - `scale_factor`: The scale factor used for tf-idf normalization +#' - `feature_means`: The means of the features used for normalization +#' - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth +#' - `feature_loadings`: SVD component with dimension `(n_variable_features, n_dimensions)` used to linearly transform input data into cell embeddings +#' - `feature_names`: The names of the features in the matrix +#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. +#' +#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +#' - 17.1 MB memory usage, 25.1 seconds runtime +#' @seealso `project()` `DimReduction()` `normalize_tfidf()` `normalize_log()` `svds()` +#' @examples +#' ## Prep data +#' nrows <- 50 +#' ncols <- 1000 +#' mat <- matrix(1:(nrows*ncols), nrow = nrows) %>% as("IterableMatrix") +#' rownames(mat) <- paste0("feat", seq(nrows)) +#' colnames(mat) <- paste0("cell", seq(ncols)) +#' +#' +#' ####################################################################### +#' ## LSI() example +#' ####################################################################### +#' lsi_result <- LSI(mat, n_dimensions = 10) +#' lsi_result +#' +#' +#' @export +LSI <- function( + mat, n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, + threads = 1L, verbose = FALSE +) { + if (rlang::is_missing(mat)) { + return(create_partial()) + } + assert_is_mat(mat) + assert_is_wholenumber(n_dimensions) + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) +
assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) + assert_is_wholenumber(threads) + + if (verbose) log_progress("Starting LSI") + # Normalization + if (verbose) log_progress("Normalizing matrix") + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads) + read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) + mat <- normalize_tfidf( + mat = mat, + feature_means = mat_stats$row_stats["mean", ], + scale_factor = scale_factor, + threads = threads + ) + # Save to prevent repeating queued calculations during SVD + temp_mat_dir <- tempfile("mat") + mat <- write_matrix_dir( + convert_matrix_type(mat, type = "float"), + temp_mat_dir, compress = TRUE + ) + on.exit(unlink(temp_mat_dir, recursive = TRUE, expand = FALSE)) + # Run pca + if (verbose) log_progress("Calculating SVD") + svd_attr <- svds(mat, k = n_dimensions, threads = threads) + pca_res <- svd_attr$v %*% diag(svd_attr$d) + rownames(pca_res) <- colnames(mat) + + # Filter out PCs that are highly correlated with sequencing depth + pca_corrs <- abs(cor(read_depth, pca_res)) + pca_feats_to_keep <- which(pca_corrs < corr_cutoff) + if (length(pca_feats_to_keep) != n_dimensions) { + if (verbose) log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) + pca_res <- pca_res[, pca_feats_to_keep, drop = FALSE] + } + fitted_params <- list( + scale_factor = scale_factor, + feature_means = mat_stats$row_stats["mean", ], + pcs_to_keep = pca_feats_to_keep, + feature_loadings = svd_attr$u + ) + res <- DimReduction( + cell_embeddings = pca_res, + fitted_params = fitted_params, + feature_names = rownames(mat) + ) + class(res) <- c("LSI", class(res)) + return(res) +} +#' @rdname LSI +#' @return +#' `project()` IterableMatrix of the projected data of shape `(n_dimensions, ncol(mat))`. 
+#' @inheritParams project +#' @examples +#' ####################################################################### +#' ## project() example +#' ####################################################################### +#' dim(project(lsi_result, mat)) +#' +#' +#' @export +project.LSI <- function(x, mat, threads = 1L, ...) { + assert_is_mat(mat) + assert_is(x, "LSI") + + fitted_params <- x$fitted_params + if (!is.null(rownames(mat)) && !is.null(x$feature_names)) { + assert_true(all(x$feature_names %in% rownames(mat))) + mat <- mat[x$feature_names, ] + } + # Do a check to make sure that the number of rows in the matrix is the same as the number of rows in SVD$u + assert_true(nrow(mat) == nrow(fitted_params$feature_loadings)) + mat <- normalize_tfidf( + mat, + feature_means = fitted_params$feature_means, + scale_factor = fitted_params$scale_factor, + threads = threads + ) + feature_loadings <- fitted_params$feature_loadings + res <- t(mat) %*% feature_loadings + if (length(fitted_params$pcs_to_keep) != ncol(res)) { + res <- res[, fitted_params$pcs_to_keep, drop = FALSE] + } + return(res) +} + + + +#' Run Iterative LSI on a matrix. +#' +#' Given a `(features x cells)` counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. +#' This uses the method described in [ArchR](https://doi.org/10.1038/s41588-021-00790-6) (Granja et al; 2019). +#' See details for more specific information. Returns a DimReduction object, which allows for projection of matrices with the same features into the same latent space. +#' @rdname IterativeLSI +#' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. +#' @param n_iterations (int) The number of LSI iterations to perform. +#' @param feature_selection_method (function) Method to use for selecting features for LSI.
+#' Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` +#' @param cluster_method (function) Method to use for clustering the post-SVD matrix. +#' The user can pass in partial parameters to the cluster method, such as by passing +#' `cluster_cells_graph(graph_to_cluster_method = cluster_graph_louvain(resolution = 0.5))` into `cluster_method`. +#' @param threads (integer) Number of threads to use. Also gets passed down into `feature_selection_method` and `cluster_method` +#' @return +#' `IterativeLSI()` An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: +#' - `cell_embeddings`: The projected data as a matrix of shape `(cells, n_dimensions)` +#' - `feature_names`: The names of features that `IterativeLSI()` was fit on. +#' Note that projection only requires the features used in each specific iteration (as described in `iter_info`) +#' - `fitted_params`: A list of the parameters used for iterative LSI. Includes the following: +#' - `lsi_method`: The method used for LSI +#' - `feature_selection_method`: The method used for selecting features +#' - `cluster_method`: The method used for clustering +#' - `feature_means`: The means of the features used for tf-idf normalization +#' - `iterations`: The number of LSI iterations run +#' - `iter_info`: A tibble of iteration info with rows as iterations. Columns include the following: +#' - `iteration`: The iteration number +#' - `feature_names`: The names of the features used for the iteration +#' - `feature_loadings`: SVD component `u` with dimension `(n_variable_features, n_dimensions)` +#' - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth +#' - `clusters`: The clusters for the iteration.
This is blank for the first iteration +#' @details +#' The Iterative LSI method is as follows: +#' - First iteration: +#' - Select features using `feature_selection_method` +#' - Perform LSI on the selected features +#' - If `n_iterations` is 1, return the projected data from the first PCA projection +#' - Else, cluster the LSI results using `cluster_method` +#' - For each subsequent iteration: +#' - Pseudobulk the clusters from the previous iteration and select the top features based on the pseudobulked clusters +#' - Perform LSI on the selected features +#' - If this is the final iteration, return the projected data from this PCA projection +#' - Else, cluster the LSI results using `cluster_method` +#' +#' There are some minor differences when compared to the ArchR implementation: +#' - ArchR binarizes data prior to feature selection. To replicate this, the user can pass `select_features_variance(normalize_method = binarize)` for their `feature_selection_method`. +#' - `IterativeLSI()` currently does not support utilization of different feature selection methods across each iteration. +#' If one desires to use a different feature selection method for each iteration, they can take the cluster assignments from the previous +#' iteration and use them to select features and run LSI. +#' - ArchR uses a default of 25000 features picked during feature selection. As the number of input features is dependent on the input matrix fed into `IterativeLSI()`, +#' the default for `select_features_variance()` instead picks the number of variable features as a proportion of the total features provided. To mimic the ArchR implementation, +#' `feature_selection_method` can be set to `select_features_variance(num_feats = 25000)`. +#' - ArchR calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, +#' which BPCells does not encounter even with a non-subsetted matrix.
Therefore, IterativeLSI will run LSI on the entire matrix for each iteration. +#' - ArchR defaults to using Seurat clustering, which utilizes the Louvain algorithm (See `Seurat::FindClusters()`). In contrast, `IterativeLSI()` utilizes +#' leiden, which should provide the same clustering results while being faster. +#' - ArchR also plots and calculates a umap of every iteration's dimensionality reduction. While this is not implemented in `IterativeLSI()`, +#' one can use the `project()` method with the `iteration` argument set to the desired iteration to get projected data. This can then be fed into `uwot::umap()` +#' - ArchR filters out PCs with a correlation to sequencing depth greater than 0.75. +#' While corr_cutoff is provided as an argument in `IterativeLSI()`, it is set to not remove any PCs by default. +#' - ArchR filters out outliers dependent on number of accessible regions of cells, by the bottom and top quantiles. This is not implemented in `IterativeLSI()`, +#' but can be done as a preprocessing step.
+#' @seealso `LSI()` `DimReduction()` `svds()` +#' `cluster_cells_graph()` `select_features_variance()` `select_features_dispersion()` +#' `select_features_mean()` `select_features_binned_dispersion()` +#' @inheritParams LSI +#' @examples +#' ## Prep data +#' nrows <- 350 +#' ncols <- 2000 +#' mat <- matrix(1:(nrows*ncols), nrow = nrows) %>% as("IterableMatrix") +#' rownames(mat) <- paste0("feat", seq(nrows)) +#' colnames(mat) <- paste0("cell", seq(ncols)) +#' +#' +#' ####################################################################### +#' ## IterativeLSI() examples +#' ####################################################################### +#' dim_reduction <- IterativeLSI(mat, n_dimensions = 5) +#' +#' ## Can customize parameters using partialization +#' dim_reduction <- IterativeLSI( +#' mat, +#' n_dimensions = 10, +#' feature_selection_method = select_features_variance( +#' num_feats = 0.5, +#' normalize_method = normalize_tfidf(scale_factor = 5000) +#' ), +#' cluster_method = cluster_cells_graph( +#' graph_to_cluster_method = cluster_graph_louvain(resolution = 0.5), +#' knn_to_graph_method = knn_to_snn_graph +#' ) +#' ) +#' dim_reduction +#' +#' +#' @export +IterativeLSI <- function( + mat, + n_iterations = 2, + feature_selection_method = select_features_variance, + scale_factor = 1e4, + n_dimensions = 50L, + corr_cutoff = 1, + cluster_method = cluster_cells_graph, + threads = 1L, verbose = FALSE +) { + assert_has_package("RcppHNSW") + assert_is_mat(mat) + assert_greater_than_zero(n_iterations) + assert_is_wholenumber(n_iterations) + assert_is_wholenumber(threads) + assert_is(verbose, "logical") + fitted_params <- list( + lsi_method = LSI(n_dimensions = n_dimensions, corr_cutoff = corr_cutoff, scale_factor = scale_factor, threads = threads), + cluster_method = cluster_method, + feature_selection_method = feature_selection_method, + feature_means = matrix_stats(mat, row_stats = "mean", threads = threads)$row_stats["mean",], + iterations = n_iterations, + 
iter_info = tibble::tibble( + iteration = integer(), + feature_names = list(), + feature_loadings = list(), + pcs_to_keep = list(), + clusters = list() + ) + ) + feature_selection_method <- partial_apply(feature_selection_method, threads = threads, .missing_args_error = FALSE) + if (verbose) log_progress("Starting Iterative LSI") + for (i in seq_len(n_iterations)) { + if (verbose) log_progress(sprintf("Starting Iterative LSI iteration %s of %s", i, n_iterations)) + # run variable feature selection + if (verbose) log_progress("Selecting features") + if (i == 1) { + var_feature_table <- feature_selection_method(mat) + } else { + var_feature_table <- feature_selection_method(pseudobulk_res) + } + variable_features <- var_feature_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(feature) + if (is.character(variable_features)) { + mat_indices <- which(rownames(mat) %in% variable_features) + } else { + mat_indices <- variable_features + } + if (length(mat_indices) < n_dimensions) { + rlang::abort( + sprintf(paste0( + "Number of variable features selected (%s) is smaller than number of dimensions requested (%s). 
\n", + "Try setting the num_feats arg in feature_selection_method as a larger value."), + length(mat_indices), n_dimensions + ) + ) + } + # run LSI + if (verbose) log_progress("Running LSI") + + lsi_res_obj <- LSI( + mat = mat[mat_indices, ], + n_dimensions = n_dimensions, + corr_cutoff = corr_cutoff, + scale_factor = scale_factor, + threads = threads, + verbose = verbose + ) + fitted_params$iter_info <- tibble::add_row( + fitted_params$iter_info, iteration = i, + feature_names = list(variable_features), + feature_loadings = list(lsi_res_obj$fitted_params$feature_loadings), + pcs_to_keep = list(lsi_res_obj$fitted_params$pcs_to_keep) + ) + # only cluster + pseudobulk if this isn't the last iteration + if (i == n_iterations) break + + # cluster the LSI results + if (verbose) log_progress("Clustering LSI results") + clustering_res <- lsi_res_obj$cell_embeddings[, lsi_res_obj$fitted_params$pcs_to_keep, drop = FALSE] %>% + partial_apply(cluster_method, threads = threads, .missing_args_error = FALSE)() + fitted_params$iter_info$clusters[[i]] <- clustering_res + # pseudobulk and pass onto next iteration + if (verbose) log_progress("Pseudobulking matrix") + pseudobulk_res <- pseudobulk_matrix(mat, clustering_res, threads = threads) + rownames(pseudobulk_res) <- rownames(mat) + } + if (verbose) log_progress("Finished running Iterative LSI") + res <- DimReduction( + cell_embeddings = lsi_res_obj$cell_embeddings, + fitted_params = fitted_params, + feature_names = rownames(mat) + ) + class(res) <- c("IterativeLSI", class(res)) + return(res) +} +#' @rdname IterativeLSI +#' @param iteration (integer) Which iteration of `IterativeLSI`'s features and loadings to use for projection. +#' @return +#' `project()` Matrix of the projected data of shape `(cells, n_dimensions)`. 
+#' @inheritParams project +#' @examples +#' ####################################################################### +#' ## project() example +#' ####################################################################### +#' dim(project(dim_reduction, mat)) +#' +#' +#' @export +project.IterativeLSI <- function(x, mat, iteration = x$fitted_params$iterations, threads = 1L, ...) { + assert_is_mat(mat) + assert_is_wholenumber(threads) + assert_is_wholenumber(iteration) + fitted_params <- x$fitted_params + # Get the desired iteration row of iter_info tibble + assert_true(iteration <= x$fitted_params$iterations) + iter_info <- fitted_params$iter_info[iteration, ] + + # Do a check to make sure that the fitted features all exist in input matrix + if (!is.null(rownames(mat)) && !is.null(x$feature_names)) { + assert_true(all(x$feature_names %in% rownames(mat))) + } + # Subset to variable features + if (is.character(iter_info$feature_names[[1]])) { + mat_indices <- which(rownames(mat) %in% iter_info$feature_names[[1]]) + } else { + mat_indices <- iter_info$feature_names[[1]] + } + mat <- mat[mat_indices,] + # Run LSI + # since we don't hold the LSI object, copy the internal logic from `project.LSI()` + lsi_attr <- attr(x$fitted_params$lsi_method, "args") + + mat <- normalize_tfidf( + mat = mat, + feature_means = fitted_params$feature_means, + scale_factor = lsi_attr$scale_factor, + threads = threads + ) + + feature_loadings <- iter_info$feature_loadings[[1]] + res <- t(mat) %*% feature_loadings + if (length(iter_info$pcs_to_keep[[1]]) != ncol(res)) { + res <- res[, iter_info$pcs_to_keep[[1]], drop = FALSE] + } + return(res) +} + #' Test for marker features #' #' Given a features x cells matrix, perform one-vs-all differential @@ -70,6 +764,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } + #' Aggregate counts matrices by cell group or feature. 
#' #' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each diff --git a/r/R/transforms.R b/r/R/transforms.R index 2a5de994..4a34ff35 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -294,16 +294,26 @@ setMethod("short_description", "TransformBinarize", function(x) { #' comparison to the threshold is >= (strict_inequality=FALSE) #' or > (strict_inequality=TRUE). #' @return binarized IterableMatrix object +#' @examples +#' set.seed(12345) +#' mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +#' mat +#' mat <- as(mat, "IterableMatrix") +#' +#' +#' mat_binarized <- binarize(mat, threshold=1) +#' mat_binarized +#' as(mat_binarized, "dgCMatrix") #' @export -binarize <- function(mat, threshold=0, strict_inequality=TRUE) { - assert_is(mat, "IterableMatrix") +binarize <- function(mat, threshold = 0, strict_inequality = TRUE) { assert_is(threshold, "numeric") assert_len(threshold, 1) assert_is(strict_inequality, "logical") - if (strict_inequality == TRUE && threshold < 0) + if (strict_inequality && threshold < 0) stop("binarize threshold must be greater than or equal to zero when strict_inequality is TRUE") - if (strict_inequality == FALSE && threshold <= 0) + if (!strict_inequality && threshold <= 0) stop("binarize threshold must be greater than zero when strict_inequality is FALSE") + assert_is(mat, "IterableMatrix") res <- wrapMatrix("TransformBinarize", convert_matrix_type(mat, "double"), @@ -923,3 +933,107 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { vars_to_regress = vars_to_regress ) } + +################# +# Normalizations +################# + +#' Normalization helper functions +#' +#' Apply standard normalizations to a `(features x cells)` counts matrix. +#' +#' @rdname normalize +#' @param mat (IterableMatrix) Counts matrix with dimensions `(features x cells)`. 
+#' @param scale_factor (numeric) Scaling factor to multiply matrix by prior to normalization (see formulas below). +#' @param threads (integer) Number of threads to use. +#' @returns +#' Consider a matrix \eqn{X}, where the row index \eqn{i} refers to each feature +#' and the column index \eqn{j} refers to each cell. For each element \eqn{{x}_{ij} \in X}, the +#' normalized value \eqn{\tilde{x}_{ij}} is calculated as: +#' +#' - `normalize_log`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +#' @details +#' **Passing normalization functions with non-default arguments** +#' +#' If the `mat` argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. +#' This can be used to customize normalization parameters in other single cell functions in BPCells (e.g. `select_features_mean()`). +#' +#' **Related functions from other packages** +#' - `normalize_log`: Corresponds to `Seurat::NormalizeData()` with its default "LogNormalize" method.
+#' @examples +#' set.seed(12345) +#' mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +#' mat +#' +#' mat <- as(mat, "IterableMatrix") +#' +#' +#' ####################################################################### +#' ## normalize_log() examples +#' ####################################################################### +#' normalize_log(mat) +#' +#' ## normalization functions can also be called with partial arguments +#' partial_log <- normalize_log(scale_factor = 1e5) +#' partial_log +#' +#' partial_log(mat) +#' +#' +#' @export +normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { + assert_greater_than_zero(scale_factor) + assert_is_wholenumber(threads) + if (rlang::is_missing(mat)) return(create_partial()) + assert_is_mat(mat) + read_depth <- matrix_stats(mat, col_stats = "mean", threads = threads)$col_stats["mean", ] * nrow(mat) + mat <- mat %>% multiply_cols(1 / read_depth) + return(log1p(mat * scale_factor)) +} + + +#' @rdname normalize +#' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by (`rowMeans(mat)` by default). +#' If `feature_means` has names and `mat` has row names, match values by name. +#' Otherwise, assume `feature_means` has the same length and ordering as the matrix rows. +#' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +#' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 2019, matching the default behavior of `Signac::RunTFIDF()`. This also matches the normalization used within `ArchR::addIterativeLSI()`, but with `binarize = FALSE`. 
+#' @examples +#' ####################################################################### +#' ## normalize_tfidf() example +#' ####################################################################### +#' normalize_tfidf(mat) +#' +#' +#' @export +normalize_tfidf <- function( + mat, feature_means = NULL, + scale_factor = 1e4, threads = 1L +) { + assert_greater_than_zero(scale_factor) + assert_is_wholenumber(threads) + if (rlang::is_missing(mat)) return(create_partial()) + assert_is_mat(mat) + # If feature means are passed in, only need to calculate term frequency + if (is.null(feature_means)) { + mat_stats <- matrix_stats(mat, row_stats = "mean", col_stats = "mean", threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) + } else { + assert_is_numeric(feature_means) + if (!is.null(names(feature_means)) && !is.null(rownames(mat))) { + # Make sure every name in feature means exists in rownames(mat) + assert_true(all(rownames(mat) %in% names(feature_means))) + # subset feature_means to the rownames of mat in the case there is a length mismatch + # but the feature names all exist in feature_means, + feature_means <- feature_means[rownames(mat)] + } else { + assert_len(feature_means, nrow(mat)) + } + read_depth <- matrix_stats(mat, col_stats = "mean", threads = threads)$col_stats["mean", ] * nrow(mat) + } + tf <- mat %>% multiply_cols(1 / read_depth) + idf <- 1 / feature_means + tf_idf_mat <- tf %>% multiply_rows(idf) + return(log1p(tf_idf_mat * scale_factor)) +} \ No newline at end of file diff --git a/r/R/utils.R b/r/R/utils.R index 4ea62d15..b27fea00 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -45,15 +45,91 @@ document_granges <- function( ), intro_noun, bullets) } +# Add current timestamp to a character string. +add_timestamp <- function(msg) { + return(paste0(format(Sys.time(), "%Y-%m-%d %H:%M:%S "), msg)) +} + # Function which prints a message using shell echo. 
# Useful for printing messages from inside mclapply when running in Rstudio. log_progress <- function(msg, add_timestamp = TRUE){ if (add_timestamp) { - msg <- paste0(format(Sys.time(), "%Y-%m-%d %H:%M:%S "), msg) + msg <- add_timestamp(msg) } if (.Platform$GUI == "RStudio") { system(sprintf('echo "%s"', paste0(msg, collapse=""))) } else { message(msg) } -} \ No newline at end of file +} + +#' Helper to create partial functions +#' +#' Automatically creates a partial application of the caller +#' function including all non-missing arguments. +#' +#' @return A `bpcells_partial` object (a function with some extra attributes) +#' @keywords internal +create_partial <- function() { + env <- rlang::caller_env() + fn_sym <- rlang::caller_call()[[1]] + fn <- rlang::caller_fn() + + args <- list() + for (n in names(formals(fn))) { + if (rlang::is_missing(env[[n]])) next + args[[n]] <- env[[n]] + } + + ret <- do.call(partial_apply, c(fn, args)) + attr(ret, "body")[[1]] <- fn_sym + return(ret) +} + + +#' Create partial function calls +#' +#' Specify some but not all arguments to a function. +#' +#' @param f A function +#' @param ... Named arguments to `f` +#' @param .overwrite (bool) If `f` is already an output from +#' `partial_apply()`, whether parameter re-definitions should +#' be ignored or overwrite the existing definitions +#' @param .missing_args_error (bool) If `TRUE`, passing in arguments +#' that are not in the function's signature will raise an error, otherwise +#' they will be ignored +#' @return A `bpcells_partial` object (a function with some extra attributes) +#' @keywords internal +partial_apply <- function(f, ..., .overwrite = TRUE, .missing_args_error = TRUE) { + args <- rlang::list2(...) 
+ + if (is(f, "bpcells_partial")) { + prev_args <- attr(f, "args") + for (a in names(prev_args)) { + if (!(.overwrite && a %in% names(args))) { + args[[a]] <- prev_args[[a]] + } + } + function_name <- attr(f, "body")[[1]] + f <- attr(f, "fn") + } else { + function_name <- rlang::sym(rlang::caller_arg(f)) + } + # See which arguments do not exist in f + missing_args <- which(!names(args) %in% names(formals(f))) + if (length(missing_args) > 0) { + if (.missing_args_error) { + stop(sprintf("Arguments %s are not in the function signature", paste0(names(args)[missing_args], collapse=", "))) + } else { + args <- args[-missing_args]} + } + partial_fn <- do.call(purrr::partial, c(f, args)) + attr(partial_fn, "body")[[1]] <- function_name + structure( + partial_fn, + class = c("bpcells_partial", "purrr_function_partial", "function"), + args = args, + fn = f + ) +} diff --git a/r/man/DimReduction.Rd b/r/man/DimReduction.Rd new file mode 100644 index 00000000..17ee89ce --- /dev/null +++ b/r/man/DimReduction.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{DimReduction} +\alias{DimReduction} +\alias{project} +\title{Barebones definition of a DimReduction class.} +\usage{ +DimReduction( + cell_embeddings, + fitted_params = list(), + feature_names = character(0), + ... +) + +project(x, mat, ...) +} +\arguments{ +\item{x}{DimReduction object.} + +\item{(IterableMatrix)}{Input matrix of shape \verb{(features x cells)}.} +} +\value{ +\itemize{ +\item \code{DimReduction()}: DimReduction object. +} + +\itemize{ +\item \code{project()}: IterableMatrix object of the projected data. +} +} +\description{ +Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. +Child classes should implement a \code{project} method to allow for the projection of other matrices using +the fitted transformation object.
+} +\details{ +\code{project()}: Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. + +DimReduction subclasses should use the \code{project} method on new data with the same features, to project into the same latent space. +All required information to run a projection should be held in \code{x$fitted_params}, including pertinent parameters when constructing the DimReduction subclass object. + +If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features. +If there are rownames, reorder the matrix to match the order of the original matrix +} +\section{Fields}{ + +\describe{ +\item{\code{cell_embeddings}}{(IterableMatrix, dgCMatrix, matrix) Projected data of shape \verb{(cells x n_dimensions)} of the original matrix after a dimensionality reduction.} + +\item{\code{fitted_params}}{(list) A list of parameters used for the transformation of a matrix. This should include all necessary information to project new data with the same features.} + +\item{\code{feature_names}}{(character) The names of the features that this DimReduction object was fit on. Matrices to be projected should have the same feature names.} +}} + diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd new file mode 100644 index 00000000..0071ce71 --- /dev/null +++ b/r/man/IterativeLSI.Rd @@ -0,0 +1,159 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{IterativeLSI} +\alias{IterativeLSI} +\alias{project.IterativeLSI} +\title{Run Iterative LSI on a matrix.} +\usage{ +IterativeLSI( + mat, + n_iterations = 2, + feature_selection_method = select_features_variance, + scale_factor = 10000, + n_dimensions = 50L, + corr_cutoff = 1, + cluster_method = cluster_cells_graph, + threads = 1L, + verbose = FALSE +) + +\method{project}{IterativeLSI}(x, mat, iteration = x$fitted_params$iterations, threads = 1L, ...) 
+} +\arguments{ +\item{mat}{(IterableMatrix) Counts matrix of shape \verb{(features x cells)}.} + +\item{n_iterations}{(int) The number of LSI iterations to perform.} + +\item{feature_selection_method}{(function) Method to use for selecting features for LSI. +Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_mean}, \code{select_features_binned_dispersion}} + +\item{scale_factor}{(numeric) Scaling factor to multiply matrix by during tf-idf normalization.} + +\item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} + +\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is greater or equal to +the corr_cutoff, it will be excluded from the final PCA matrix.} + +\item{cluster_method}{(function) Method to use for clustering the post-SVD matrix. +The user can pass in partial parameters to the cluster method, such as by passing +\code{cluster_cells_graph(mat, graph_to_cluster_method = cluster_graph_louvain(resolution = 0.5))} into \code{cluster_method}.} + +\item{threads}{(integer) Number of threads to use. Also gets passed down into \code{feature_selection_method} and \code{cluster_method}} + +\item{x}{DimReduction object.} + +\item{iteration}{(integer) Which iteration of \code{IterativeLSI}'s features and loadings to use for projection.} +} +\value{ +\code{IterativeLSI()} An object of class \code{c("IterativeLSI", "DimReduction")} with the following attributes: +\itemize{ +\item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(cells, n_dimensions)} +\item \code{feature_names}: The names of features that \code{IterativeLSI()} was fit on. +Note that projection only requires the features used in each specific iteration (as described in \code{iter_info}) +\item \code{fitted_params}: A list of the parameters used for iterative LSI. 
Includes the following: +\itemize{ +\item \code{lsi_method}: The method used for LSI +\item \code{feature_selection_method}: The method used for selecting features +\item \code{cluster_method}: The method used for clustering +\item \code{feature_means}: The means of the features used for tf-idf normalization +\item \code{iterations}: The number of LSI iterations ran +\item \code{iter_info}: A tibble of iteration info with rows as iterations. Columns include the following: +\itemize{ +\item \code{iteration}: The iteration number +\item \code{feature_names}: The names of the features used for the iteration +\item \code{feature_loadings}: SVD component \code{u} with dimension \verb{(n_dimensions, n_variable_features)} +\item \code{pcs_to_keep}: The PCs that were kept after filtering by correlation to sequencing depth +\item \code{clusters}: The clusters for the iteration. This is blank for the first iteration +} +} +} + +\code{project()} Matrix of the projected data of shape \verb{(cells, n_dimensions)}. +} +\description{ +Given a \verb{(features x cells)} counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. +This uses the method described in \href{https://doi.org/10.1038/s41588-021-00790-6}{ArchR} (Granja et al; 2019). +See details for more specific information. Returns a DimReduction object, which allows for projection of matrices with the same features into the same latent space. 
+} +\details{ +The Iterative LSI method is as follows: +\itemize{ +\item First iteration: +\itemize{ +\item Select features using \code{feature_selection_method} +\item Perform LSI on the selected features +\item If \code{n_iterations} is 1, return the projected data from the first PCA projection +\item Else, cluster the LSI results using \code{cluster_method} +} +\item For each subsequent iteration: +\itemize{ +\item Pseudobulk the clusters from the previous iteration and select the top features based on the pseudobulked clusters +\item Perform LSI on the selected features +\item If this is the final iteration, return the projected data from this PCA projection +\item Else, cluster the LSI results using \code{cluster_method} +} +} + +There are some minor differences when compared to the ArchR implementation: +\itemize{ +\item ArchR binarizes data prior to feature selection. To replicate this, the user can pass \code{select_features_variance(normalize=binarize)} for their \code{feature_selection_method}. +\item \code{IterativeLSI()} currently does not support utilization of different feature selection methods across each iteration. +If one desires to use a different feature selection method for each iteration, they can take the cluster assignments from the previous +iteration and use them to select features and run LSI. +\item ArchR uses a default of 25000 features picked during feature selection. As the number of input features is dependent on the input matrix fed into \code{IterativeLSI()}, +the default for \code{select_features_variance()} instead picks the number of variable features as a proportion of the total features provided. To mimic the ArchR implementation, +\code{feature_selection_method} can be set to \code{select_features_variance(num_feats = 25000)}. +\item ArchR calculates LSI during non-terminal iterations using a default subset of 10000 cells. 
ArchR does this to prevent a memory bottleneck, +which BPCells does not encounter even with a non-subsetted matrix. Therefore, IterativeLSI will run LSI on the entire matrix for each iteration. +\item ArchR defaults on using Seurat clustering for default, which utilizes the Louvain algorithm (See \code{Seurat::FindClusters()}). In constrast, \code{IterativeLSI()} utilizes +leiden, which should provide the same clustering results while being faster. +\item ArchR also plots and calculates a umap of every iteration's dimensionality reduction. While this is not implemented in \code{IterativeLSI()}, +one can use the \code{project()} method with the \code{iteration} argument set to the desired iteration to get projected data. This can then be fed into \code{uwot::umap()} +\item ArchR filters out PCs with a correlation to sequencing depth greater than 0.75. +While corr_cutoff is provided as an argument in \code{IterativeLSI()}, it is set to not removing any PCs by default. +\item ArchR filters out outliers dependent on number of accesible regions of cells, by the bottom and top quantiles. This is not implemented in \code{IterativeLSI()}, +but can be done as a preprocessing step. 
+} +} +\examples{ +## Prep data +nrows <- 350 +ncols <- 2000 +mat <- matrix(1:(nrows*ncols), nrow = nrows) \%>\% as("IterableMatrix") +rownames(mat) <- paste0("feat", seq(nrows)) +colnames(mat) <- paste0("cell", seq(ncols)) + + +####################################################################### +## IterativeLSI() examples +####################################################################### +dim_reduction <- IterativeLSI(mat, n_dimensions = 5) + +## Can customize parameters using partialization +dim_reduction <- IterativeLSI( + mat, + n_dimensions = 10, + feature_selection_method = select_features_variance( + num_feats = 0.5, + normalize_method = normalize_tfidf(scale_factor = 5000) + ), + cluster_method = cluster_cells_graph( + graph_to_cluster_method = cluster_graph_louvain(resolution = 0.5), + knn_to_graph_method = knn_to_snn_graph + ) +) +dim_reduction + + +####################################################################### +## project() example +####################################################################### +dim(project(dim_reduction, mat)) + + +} +\seealso{ +\code{LSI()} \code{DimReduction()} \code{svds()} +\code{cluster_cells_graph()} \code{select_features_variance()} \code{select_features_dispersion()} +\code{select_features_mean()} \code{select_features_binned_dispersion()} +} diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd new file mode 100644 index 00000000..bb28f17f --- /dev/null +++ b/r/man/LSI.Rd @@ -0,0 +1,86 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{LSI} +\alias{LSI} +\alias{project.LSI} +\title{Perform latent semantic indexing (LSI) on a matrix.} +\usage{ +LSI( + mat, + n_dimensions = 50L, + corr_cutoff = 1, + scale_factor = 10000, + threads = 1L, + verbose = FALSE +) + +\method{project}{LSI}(x, mat, threads = 1L, ...) 
+} +\arguments{ +\item{mat}{(IterableMatrix) Counts matrix of shape \verb{(features x cells)}.} + +\item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} + +\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is greater or equal to +the corr_cutoff, it will be excluded from the final PCA matrix.} + +\item{scale_factor}{(numeric) Scaling factor to multiply matrix by during tf-idf normalization.} + +\item{threads}{(integer) Number of threads to use.} + +\item{x}{DimReduction object.} +} +\value{ +\code{LSI()} An object of class \code{c("LSI", "DimReduction")} with the following attributes: +\itemize{ +\item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(cells, n_dimensions)} +\item \code{fitted_params}: A named list of the parameters used for LSI, including the following: +\itemize{ +\item \code{scale_factor}: The scale factor used for tf-idf normalization +\item \code{feature_means}: The means of the features used for normalization +\item \code{pcs_to_keep}: The PCs that were kept after filtering by correlation to sequencing depth +\item \code{feature_loadings}: SVD component with dimension \verb{(n_variable_features, n_dimensions)} used to linearly transform input data into cell embeddings +} +\item \code{feature_names}: The names of the features in the matrix +} + +\code{project()} IterableMatrix of the projected data of shape \verb{(n_dimensions, ncol(mat))}. +} +\description{ +Given a \verb{(features x cells)} counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation +of the matrix of shape \verb{(n_dimensions, ncol(mat))}. +Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. +} +\details{ +Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. 
Tf-idf implementation is from Stuart & Butler et al. 2019. + +Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +\itemize{ +\item 17.1 MB memory usage, 25.1 seconds runtime +} +} +\examples{ +nrows <- 50 +ncols <- 1000 +mat <- matrix(1:(nrows*ncols), nrow = nrows) \%>\% as("IterableMatrix") +rownames(mat) <- paste0("feat", seq(nrows)) +colnames(mat) <- paste0("cell", seq(ncols)) + + +####################################################################### +## LSI() example +####################################################################### +lsi_result <- LSI(mat, n_dimensions = 10) +lsi_result + + +####################################################################### +## project() example +####################################################################### +dim(project(lsi_result, mat)) + + +} +\seealso{ +\code{project()} \code{DimReduction()} \code{normalize_tfidf()} \code{normalize_log()} \code{svds()} +} diff --git a/r/man/binarize.Rd b/r/man/binarize.Rd index c162a8c8..acb9fefb 100644 --- a/r/man/binarize.Rd +++ b/r/man/binarize.Rd @@ -28,3 +28,14 @@ is set to FALSE, element values greater than or equal to the threshold are set to one. As an alternative, the \code{<}, \code{<=}, \code{>}, and \code{>=} operators are also supported. } +\examples{ +set.seed(12345) +mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +mat +mat <- as(mat, "IterableMatrix") + + +mat_binarized <- binarize(mat, threshold=1) +mat_binarized +as(mat_binarized, "dgCMatrix") +} diff --git a/r/man/call_macs_peaks.Rd b/r/man/call_macs_peaks.Rd new file mode 100644 index 00000000..5ad425c5 --- /dev/null +++ b/r/man/call_macs_peaks.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/atac_utils.R +\name{call_macs_peaks} +\alias{call_macs_peaks} +\title{Call peaks using MACS2/3} +\usage{ +call_macs_peaks(...) 
+} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} + +This function has been renamed to \code{call_peaks_macs()} +} +\keyword{internal} diff --git a/r/man/cluster_cells_graph.Rd b/r/man/cluster_cells_graph.Rd new file mode 100644 index 00000000..b99ff5d6 --- /dev/null +++ b/r/man/cluster_cells_graph.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{cluster_cells_graph} +\alias{cluster_cells_graph} +\title{Cluster embeddings using a KNN-Graph based algorithm} +\usage{ +cluster_cells_graph( + mat, + knn_method = knn_hnsw, + knn_to_graph_method = knn_to_geodesic_graph, + graph_to_cluster_method = cluster_graph_leiden, + threads = 0L, + verbose = FALSE +) +} +\arguments{ +\item{mat}{(matrix) Cell embeddings matrix of shape \verb{(cells x n_embeddings)}} + +\item{knn_method}{(function) Function to convert cell embeddings into a knn object. +Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}.} + +\item{knn_to_graph_method}{(function) Function to convert the knn object returned from \code{knn_method} to a graph adjacency matrix. +Must be a (optionally partialized) version of \code{knn_to_graph()}, \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}.} + +\item{graph_to_cluster_method}{(function) Clustering algorithm that converts a graph adjacency matrix +returned from \code{graph_to_cluster_method} into cluster labels for each cell. +Must be a (optionally partialized) version of \code{cluster_graph_leiden()}, \code{cluster_graph_louvain()} or \code{cluster_graph_seurat()}.} + +\item{threads}{(integer) Number of threads to use in \code{knn_method}, \code{knn_to_graph_method} and \code{graph_to_cluster_method}. 
If these functions do not utilize +a \code{threads} argument, this is silently ignored.} + +\item{verbose}{(logical) Whether to print progress information in \code{knn_method}, \code{knn_to_graph_method} and \code{graph_to_cluster_method}. If these functions do not utilize +a \code{verbose} argument, this is silently ignored.} +} +\value{ +(factor) Factor vector containing the cluster assignment for each cell. +} +\description{ +Take in a cell embedding matrix, and sequentially convert it into a kNN object, then to +a graph adjacency matrix. Following, assign a label to every cell using a clustering algorithm. +} +\details{ +\code{cluster_cells_graph()} acts as a helper function to wrap input creation and \code{kNN} graph adjacency-based clustering to be done together. The user +can also manually pass cell embeddings to their preferred knn/clustering functions of choices. + +\strong{Clustering customization through partialized parameters} + +Customization of clustering is possible through partialization of each parameter in \code{cluster_cells_graph()} that is a function. +In detail, each parameter that requests a function +may take in one with only some of the arguments provided. If the first argument is not provided, a copy of a function is utilized that has its parameters +changed with the arguments provided. 
+ +For instance, if the user desires for \code{cluster_cells_graph()} to instead use \code{cluster_graph_louvain()} with resolution different than the default, +they can instead call \code{cluster_cells_graph()} like so: +\code{cluster_cells_graph(mat, graph_to_cluster_method = cluter_graph_louvain(resolution = 0.5))} +} +\seealso{ +\code{knn_hnsw()} \code{knn_annoy()} \code{knn_to_graph()} \code{knn_to_snn_graph()} \code{knn_to_geodesic_graph()} \code{cluster_graph_leiden()} \code{knn_to_snn_graph()} \code{knn_to_geodesic_graph()} +} diff --git a/r/man/cluster.Rd b/r/man/cluster_graph.Rd similarity index 88% rename from r/man/cluster.Rd rename to r/man/cluster_graph.Rd index ce47c1db..0794ca6c 100644 --- a/r/man/cluster.Rd +++ b/r/man/cluster_graph.Rd @@ -7,19 +7,19 @@ \title{Cluster an adjacency matrix} \usage{ cluster_graph_leiden( - snn, + mat, resolution = 1, objective_function = c("modularity", "CPM"), seed = 12531, ... ) -cluster_graph_louvain(snn, resolution = 1, seed = 12531) +cluster_graph_louvain(mat, resolution = 1, seed = 12531) -cluster_graph_seurat(snn, resolution = 0.8, ...) +cluster_graph_seurat(mat, resolution = 0.8, ...) } \arguments{ -\item{snn}{Symmetric adjacency matrix (dgCMatrix) output from e.g. \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Only the lower triangle is used} +\item{mat}{Symmetric adjacency matrix (dgCMatrix) output from e.g. \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Only the lower triangle is used.} \item{resolution}{Resolution parameter. 
Higher values result in more clusters} diff --git a/r/man/create_partial.Rd b/r/man/create_partial.Rd new file mode 100644 index 00000000..b8bc6939 --- /dev/null +++ b/r/man/create_partial.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{create_partial} +\alias{create_partial} +\title{Helper to create partial functions} +\usage{ +create_partial() +} +\value{ +A \code{bpcells_partial} object (a function with some extra attributes) +} +\description{ +Automatically creates a partial application of the caller +function including all non-missing arguments. +} +\keyword{internal} diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd new file mode 100644 index 00000000..bfb9ebb3 --- /dev/null +++ b/r/man/feature_selection.Rd @@ -0,0 +1,159 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_variance} +\alias{select_features_variance} +\alias{select_features_dispersion} +\alias{select_features_mean} +\alias{select_features_binned_dispersion} +\title{Feature selection functions} +\usage{ +select_features_variance( + mat, + num_feats = 0.05, + normalize_method = NULL, + threads = 1L +) + +select_features_dispersion( + mat, + num_feats = 0.05, + normalize_method = NULL, + threads = 1L +) + +select_features_mean( + mat, + num_feats = 0.05, + normalize_method = NULL, + threads = 1L +) + +select_features_binned_dispersion( + mat, + num_feats = 0.05, + n_bins = 20, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) Counts matrix with dimensions \verb{(features x cells)}.} + +\item{num_feats}{(float) Number of features to mark as highly_variable. If 0 < \code{num_feats} < 1, then interpret as a fraction of features.} + +\item{normalize_method}{(function) Used to normalize the matrix prior to feature selection by calling \code{normalize_method(mat)} if it is not \code{NULL}. 
+For example, pass \code{normalize_log()} or \code{normalize_tfidf()}.} + +\item{threads}{(integer) Number of threads to use. Also overrides the threads argument in \code{normalize_method}} + +\item{n_bins}{(integer) Number of bins to split features into in order to control for the relationship between mean expression and dispersion (see details).} +} +\value{ +Return a dataframe with the following columns: +\itemize{ +\item \code{feature}: Feature name. +\item \code{score}: Scoring of the feature, depending on the method used. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. +} + +Each different feature selection method will have a different scoring method. +Consider a matrix \eqn{X}, where the row index \eqn{i} refers to each feature +and the column index \eqn{j} refers to each cell. For each feature \eqn{x_{i} \in X}, we define the following feature-selection scores: +\itemize{ +\item \code{select_features_variance}: \eqn{\mathrm{Score}(x_i) = \frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2} +} + +\itemize{ +\item \code{select_features_dispersion}: \eqn{\mathrm{Score}(x_i) = \frac{\frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2}{\bar{x}_i}} +} + +\itemize{ +\item \code{select_features_mean}: \eqn{\mathrm{Score}(x_i) = \frac{\sum_{j=1}^{n}\bigl(x_{ij}\bigr)}{n}} +} + +\itemize{ +\item \code{select_features_binned_dispersion}: Process described in \code{details}. +} +} +\description{ +Apply a feature selection method to a non-normalized \verb{(features x cells)} matrix. We recommend using counts matrices as input and to +apply any normalizations prior to feature selection via the normalize argument (if available). +Instead of directly subsetting the input matrix, +an output dataframe is returned, indicating which features are highly variable, and the scoring of each feature. +} +\details{ +\code{select_features_binned_dispersion} implements the approach from Satija et al. 
2015: +\enumerate{ +\item Bin features into equal-width bins by \code{log1p(mean)} +\item Calculate dispersion of each feature as \code{log(variance / mean)} +\item Z-score normalize dispersion within each bin, and select highest normalized dispersion across all bins +} + +If the number of features within a bin is equal to 1, then the mean dispersion for that bin is set to 1. + +This should be equivalent to \code{Seurat::FindVariableFeatures()} with \code{selection.method="mean.var.plot"} +and \code{scanpy.pp.highly_variable_genes()} with \code{flavor="seurat"}. +} +\examples{ +## Prep data +set.seed(12345) +mat <- matrix(rpois(5*8, lambda=1), nrow=5, ncol=8) +rownames(mat) <- paste0("gene", seq_len(nrow(mat))) +mat +mat <- as(mat, "IterableMatrix") + + +####################################################################### +## select_features_variance() examples +####################################################################### +select_features_variance( + mat, + num_feats=2, + normalize_method=normalize_log +) + +# Because of how the BPCells `normalize` functions behave when the matrix +# argument is missing, we can also customize the normalization parameters +# using partial arguments: +variable_features <- select_features_variance( + mat, + num_feats=2, + normalize_method=normalize_log(scale_factor=20) +) +# One can then filter to only variable features using the subset operator: +mat[variable_features$feature[variable_features$highly_variable],] + + +####################################################################### +## select_features_dispersion() example +####################################################################### +select_features_dispersion( + mat, + num_feats = 2, + normalize_method = normalize_log +) + + +####################################################################### +## select_features_mean() example +####################################################################### +select_features_mean( + mat, + num_feats = 1, + 
normalize_method = normalize_log +) + + +####################################################################### +## select_features_binned_dispersion() example +####################################################################### +select_features_binned_dispersion( + mat, + num_feats = 2, + n_bins = 2 +) + + +} +\seealso{ +\code{normalize_tfidf()} \code{normalize_log()} +} diff --git a/r/man/is_adjacency_matrix.Rd b/r/man/is_adjacency_matrix.Rd new file mode 100644 index 00000000..c44ce669 --- /dev/null +++ b/r/man/is_adjacency_matrix.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{is_adjacency_matrix} +\alias{is_adjacency_matrix} +\title{Check if an input is a graph adjacency matrix.} +\usage{ +is_adjacency_matrix(mat) +} +\value{ +TRUE if the mat is a graph adjacency matrix, FALSE otherwise +} +\description{ +Clustering functions like \code{cluster_graph_leiden()} and \code{cluster_graph_louvain()} require a graph adjacency matrix as input. +We assume that any square \code{dgCMatrix} is a graph adjacency matrix. +} +\keyword{internal} diff --git a/r/man/is_knn_object.Rd b/r/man/is_knn_object.Rd new file mode 100644 index 00000000..5713e23f --- /dev/null +++ b/r/man/is_knn_object.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{is_knn_object} +\alias{is_knn_object} +\title{Check if an input is a kNN object} +\usage{ +is_knn_object(mat) +} +\value{ +TRUE if the mat is a knn object, FALSE otherwise +} +\description{ +knn object functions \code{knn_hnsw()} and \code{knn_annoy()} return a list of two matrices, \code{idx} and \code{dist}. +These are used as inputs to create graph adjacency matrices for clustering. +Assume any list with at least both \code{idx} and \code{dist} items is a kNN object. 
+} +\keyword{internal} diff --git a/r/man/knn.Rd b/r/man/knn.Rd index 3efbc9e7..7944fc7a 100644 --- a/r/man/knn.Rd +++ b/r/man/knn.Rd @@ -3,7 +3,7 @@ \name{knn_hnsw} \alias{knn_hnsw} \alias{knn_annoy} -\title{Get a knn matrix from reduced dimensions} +\title{Get a knn object from reduced dimensions} \usage{ knn_hnsw( data, @@ -17,7 +17,7 @@ knn_hnsw( knn_annoy( data, - query = data, + query = NULL, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, @@ -38,7 +38,7 @@ knn_annoy( \item{threads}{Number of threads to use. Note that result is non-deterministic if threads > 1} -\item{ef}{ef parameter for RccppHNSW::hnsw_search. Increase for slower search but +\item{ef}{ef parameter for \code{RcppHNSW::hnsw_search()}. Increase for slower search but improved accuracy} \item{n_trees}{Number of trees during index build time. More trees gives higher accuracy} diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd new file mode 100644 index 00000000..c270cb09 --- /dev/null +++ b/r/man/normalize.Rd @@ -0,0 +1,79 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transforms.R +\name{normalize_log} +\alias{normalize_log} +\alias{normalize_tfidf} +\title{Normalization helper functions} +\usage{ +normalize_log(mat, scale_factor = 10000, threads = 1L) + +normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) Counts matrix with dimensions \verb{(features x cells)}.} + +\item{scale_factor}{(numeric) Scaling factor to multiply matrix by prior to normalization (see formulas below).} + +\item{threads}{(integer) Number of threads to use.} + +\item{feature_means}{(numeric, optional) Pre-calculated means of the features to normalize by (\code{rowMeans(mat)} by default). +If \code{feature_means} has names and \code{mat} has row names, match values by name. 
+Otherwise, assume \code{feature_means} has the same length and ordering as the matrix rows.} +} +\value{ +Consider a matrix \eqn{X}, where the row index \eqn{i} refers to each feature +and the column index \eqn{j} refers to each cell. For each element \eqn{{x}_{ij} \in X}, the +normalized value \eqn{\tilde{x}_{ij}} is calculated as: +\itemize{ +\item \code{normalize_log}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +} + +\itemize{ +\item \code{normalize_tfidf}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +} +} +\description{ +Apply standard normalizations to a \verb{(features x cells)} counts matrix. +} +\details{ +\strong{Passing to \code{normalize} parameters with non-default arguments} + +If the \code{mat} argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. +This can be used to customize \code{normalize} parameters in other single cell functions in BPCells (e.g. \code{select_features_mean()}). + +\strong{Related functions from other packages} +\itemize{ +\item \code{normalize_log}: Corresponds to \code{Seurat::NormalizeData()} with its default "LogNormalize" method. +} + +\itemize{ +\item \code{normalize_tfidf}: This follows the formula from Stuart, Butler et al. 2019, matching the default behavior of \code{Signac::RunTFIDF()}. This also matches the normalization used within \code{ArchR::addIterativeLSI()}, but with \code{binarize = FALSE}. 
+} +} +\examples{ +set.seed(12345) +mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +mat + +mat <- as(mat, "IterableMatrix") + + +####################################################################### +## normalize_log() examples +####################################################################### +normalize_log(mat) + +## normalization functions can also be called with partial arguments +partial_log <- normalize_log(scale_factor = 1e5) +partial_log + +partial_log(mat) + + +####################################################################### +## normalize_tfidf() example +####################################################################### +normalize_tfidf(mat) + + +} diff --git a/r/man/partial_apply.Rd b/r/man/partial_apply.Rd new file mode 100644 index 00000000..149eb19f --- /dev/null +++ b/r/man/partial_apply.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{partial_apply} +\alias{partial_apply} +\title{Create partial function calls} +\usage{ +partial_apply(f, ..., .overwrite = TRUE, .missing_args_error = TRUE) +} +\arguments{ +\item{f}{A function} + +\item{...}{Named arguments to \code{f}} + +\item{.overwrite}{(bool) If \code{f} is already an output from +\code{partial_apply()}, whether parameter re-definitions should +be ignored or overwrite the existing definitions} + +\item{.missing_args_error}{(bool) If \code{TRUE}, passing in arguments +that are not in the function's signature will raise an error, otherwise +they will be ignored} +} +\value{ +A \code{bpcells_partial} object (a function with some extra attributes) +} +\description{ +Specify some but not all arguments to a function. 
+} +\keyword{internal} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 64d16f01..612d817f 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -9,6 +9,11 @@ template: includes: in_header: | + + + + + after_body: | @@ -130,20 +135,24 @@ reference: - IterableMatrix-methods - pseudobulk_matrix -- title: "Reference Annotations" +- title: "Single-cell analysis helpers" +- subtitle: "Normalization and Feature Selection" - contents: - - human_gene_mapping - - match_gene_symbol - - read_gtf - - read_bed - - read_ucsc_chrom_sizes - -- title: "Clustering" + - normalize_log + - select_features_variance +- subtitle: "Clustering" - contents: + - cluster_cells_graph - knn_hnsw - cluster_graph_leiden - knn_to_graph - cluster_membership_matrix +- subtitle: "Dimensionality Reductions" +- contents: + - DimReduction + - project + - LSI + - IterativeLSI - title: "Plots" @@ -176,3 +185,11 @@ reference: - discrete_palette - collect_features - rotate_x_labels + +- title: "Reference Annotations" +- contents: + - human_gene_mapping + - match_gene_symbol + - read_gtf + - read_bed + - read_ucsc_chrom_sizes diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R new file mode 100644 index 00000000..de775f4f --- /dev/null +++ b/r/tests/real_data/ArchR_LSI.R @@ -0,0 +1,69 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +library("BPCells") +library("ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} + +#' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. 
+#' As LSI uses an iterative approach in ArchR, we compare by using a single-iteration private function from ArchR. +#' As the SVD implementation is not necessarily the same between the two packages, we take the SVD matrix +#' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation +#' we can compare between the two packages. +#' @param dir Optional directory to run the comparison in; a fresh temporary directory is created when NULL.
testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$cell_embeddings, tolerance = 1e-4)) + # also check for correlation between the two matrices in PC space + testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$cell_embeddings)) > 0.999) +} +test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 150939d4..8818d7e7 100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -1,5 +1,23 @@ -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG) diff --git a/r/tests/real_data/Scanpy_variable_feat_selection.py b/r/tests/real_data/Scanpy_variable_feat_selection.py new file mode 100644 index 00000000..6ae28c1b --- /dev/null +++ b/r/tests/real_data/Scanpy_variable_feat_selection.py @@ -0,0 +1,46 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +import sys, tempfile, os +import numpy as np +import pandas as pd +import scanpy as sc + + +def call_highly_var_genes_single_batch(temp_dir: str) -> None: + """ + Call highly_variable genes on a single batch of PBMC68k data using their interpretation of + the Seurat implementation. + Write the input anndata object csv at `/highly_var_genes_scanpy_input.csv` + Write the output as a csv, at `/highly_var_genes_scanpy_output.csv` + + Args: + temp_dir (str): Path to the temporary directory to write the input and output files. + """ + # Dataset is only (765, 700) + adata = sc.datasets.pbmc68k_reduced() + adata.var_names_make_unique() + res = sc.pp._highly_variable_genes.highly_variable_genes(adata, + n_top_genes = 50, + n_bins = 20, + flavor = "seurat", + inplace = False, + check_values = False).drop(columns = 'mean_bin') + # remove mean_bin + adata.to_df().to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_input.csv")) + res.to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_output.csv")) + + +if __name__ == "__main__": + # We use the first argument as the temporary directory + if len(sys.argv) < 2: + # If no argument is provided, use the current directory + call_highly_var_genes_single_batch(".") + else: + call_highly_var_genes_single_batch(sys.argv[1]) + \ No newline at end of file diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R new file mode 100644 index 00000000..d2eb08b2 --- /dev/null +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -0,0 +1,58 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +library("BPCells") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} + +# Compare the feature selection output of BPCells to that of Scanpy. +# Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. +# This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. +# It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. +compare_feat_selection_to_scanpy <- function(dir = NULL) { + dir <- create_temp_dir(dir) + + # Call python script + system2("python3", c("Scanpy_variable_feat_selection.py", dir)) + + # read in input csv + input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) + output_mat_scanpy <- read.csv(file.path(dir, "highly_var_genes_scanpy_output.csv"), row.names = 1) + # filter output mat to only where "highly_variable" is true + output_mat_scanpy$highly_variable <- as.logical(output_mat_scanpy$highly_variable) + output_mat_scanpy <- output_mat_scanpy[output_mat_scanpy$highly_variable,] %>% + dplyr::arrange(desc(dispersions_norm)) %>% + dplyr::select(-highly_variable) %>% # convert rownames to a column + tibble::rownames_to_column("name") %>% + dplyr::as_tibble() + + # Scanpy undoes a log1p transformation on the input matrix, so we do the same here + input_mat_bpcells <- expm1(input_mat_scanpy) + + output_bpcells <- highly_variable_features( + input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), + num_feats = 50, + n_bins = 20, + save_feat_selection = TRUE + ) + output_mat_bpcells <- output_bpcells$feature_selection + 
expect_true(all.equal(output_mat_bpcells$name, output_mat_scanpy$name)) + expect_true(all.equal(output_mat_bpcells$mean, output_mat_scanpy$means, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$dispersion, output_mat_scanpy$dispersions, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$feature_dispersion_norm, output_mat_scanpy$dispersions_norm, tolerance = 1e-6)) +} + +compare_feat_selection_to_scanpy() diff --git a/r/tests/testthat/test-clustering.R b/r/tests/testthat/test-clustering.R index b17723c2..d5a692c0 100644 --- a/r/tests/testthat/test-clustering.R +++ b/r/tests/testthat/test-clustering.R @@ -19,11 +19,10 @@ test_that("C++ SNN calculation works",{ neighbor_sim[i,] <- sample.int(cells, k) } } - + input <- list(idx = neighbor_sim, dist = matrix(runif(cells*k), nrow=cells)) min_val <- 1/15 - snn <- knn_to_snn_graph(list(idx=neighbor_sim), min_val=min_val, self_loops=TRUE) - - mat <- knn_to_graph(list(idx=neighbor_sim), use_weights=FALSE) + snn <- knn_to_snn_graph(input, min_val=min_val, self_loops=TRUE) + mat <- knn_to_graph(input, use_weights=FALSE) mat <- mat %*% t(mat) mat <- mat / (2 * k - mat) @@ -35,7 +34,7 @@ test_that("C++ SNN calculation works",{ snn, as(mat, "dgCMatrix") ) - snn2 <- knn_to_snn_graph(list(idx=neighbor_sim), min_val=min_val, self_loops=FALSE) + snn2 <- knn_to_snn_graph(input, min_val=min_val, self_loops=FALSE) diag(mat) <- 0 mat <- Matrix::drop0(mat) expect_identical( @@ -73,4 +72,23 @@ test_that("igraph clustering doesn't crash", { }) expect_no_condition(cluster_graph_louvain(graph)) +}) + +test_that("cluster_cells_graph works", { + skip_if_not_installed("RcppAnnoy") + skip_if_not_installed("RcppHNSW") + mat <- matrix(sample.int(1000, 10000, replace=TRUE), nrow=1000) + # check with default params + res <- expect_no_error(cluster_cells_graph(mat)) + # check with threads, method partialization + expect_true(class(res) == "factor") + expect_equal(nrow(mat), length(res)) + res_partialized <- expect_no_error( + 
cluster_cells_graph( + mat, knn_method = knn_annoy(k = 9), + knn_to_graph_method = knn_to_snn_graph(min_val = 1/10), + graph_to_cluster_method = cluster_graph_louvain(resolution = 0.8) + )) + expect_true(class(res_partialized) == "factor") + expect_equal(nrow(mat), length(res_partialized)) +}) \ No newline at end of file diff --git a/r/tests/testthat/test-errorChecking.R b/r/tests/testthat/test-errorChecking.R new file mode 100644 index 00000000..355a5d29 --- /dev/null +++ b/r/tests/testthat/test-errorChecking.R @@ -0,0 +1,19 @@ +# Copyright 2025 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +test_that("assert_is_mat works", { + mat <- matrix(1:4, nrow = 2) + mat_dgc <- as(mat, "dgCMatrix") + mat_iterable <- as(mat, "IterableMatrix") + expect_no_error(assert_is_mat(mat)) + expect_no_error(assert_is_mat(mat_dgc)) + expect_error(assert_is_mat(c(mat_iterable, mat_iterable))) + expect_error(assert_is_mat("a")) + expect_error(assert_is_mat(c("a", "a"))) + expect_error(assert_is_mat(1)) +}) \ No newline at end of file diff --git a/r/tests/testthat/test-matrix_transforms.R index 385605e0..a8655375 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -346,3 +346,61 @@ test_that("linear regression works", { expect_equal(as(m1, "matrix"), ans) expect_equal(as(m1t, "matrix"), ans) }) + +test_that("tf-idf normalization works", { + m <- generate_sparse_matrix(5, 5) + rownames(m) <- paste0("row", seq_len(nrow(m))) + rev_rownames <- rev(rownames(m)) + # Create tf-idf normalization for dgCMatrix + res_dgc <- log1p((diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix")) * 1e4) + + rownames(res_dgc) <- rownames(m) + m2 <- as(m, "IterableMatrix") + # Check that we can pass in row means as a (named) vector + row_means <- 
matrix_stats(m2, row_stats = c("mean"))$row_stats["mean",] + # Test that row means ordering does not matter as long as names exist + row_means_shuffled <- row_means[sample(1:length(row_means))] + # Test that row means can have an extra element as long as all rownames are in the vector + row_means_plus_one <- c(row_means, row6 = 1) + # Check when row means has no row names + row_means_no_names <- unname(row_means) + + + res <- normalize_tfidf(m2) + expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6) + res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) + res_with_row_means_partial <- normalize_tfidf(feature_means = row_means)(m2) + expect_equal(res_with_row_means, res_with_row_means_partial) + expect_identical(res, res_with_row_means) + + res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled) + expect_identical(res_with_row_means, res_with_shuffled_row_means, res) + + res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one) + expect_identical(res, res_with_row_means_with_extra_element) + + # Check cases where names exists in either row means xor rownames(mat) + # check where both don't have names + res_with_unnamed_row_means <- normalize_tfidf(m2, feature_means = row_means_no_names) + expect_identical(res_with_unnamed_row_means, res) + rownames(m2) <- NULL + res_with_unnamed_row_names <- normalize_tfidf(m2, feature_means = row_means) + res_with_unnamed_row_names_row_means <- normalize_tfidf(m2, feature_means = row_means_no_names) + rownames(res) <- NULL + expect_identical(as(res_with_unnamed_row_names, "dgCMatrix"), as(res, "dgCMatrix")) + expect_identical(as(res_with_unnamed_row_names_row_means, "dgCMatrix"), as(res, "dgCMatrix")) +}) + +test_that("normalize_log works", { + m <- generate_sparse_matrix(5, 5) + res_dgc <- m %*% diag(1/colSums(m)) %>% as("dgCMatrix") + m2 <- as(m, "IterableMatrix") + # Test that default params yield the same as log1p on dgCMatrix + 
res_1 <- as(normalize_log(m2), "dgCMatrix") + expect_equal(res_1, log1p(res_dgc*1e4), tolerance = 1e-6) + + # Test that changing scale factor works + res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") + res_2_partial <- as(normalize_log(scale_factor = 1e5)(m2), "dgCMatrix") + expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6) +}) \ No newline at end of file diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 85ad0be3..aa941d3c 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -1,4 +1,4 @@ -# Copyright 2023 BPCells contributors +# Copyright 2025 BPCells contributors # # Licensed under the Apache License, Version 2.0 or the MIT license @@ -13,6 +13,39 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = as(m, "dgCMatrix") } +generate_dense_matrix <- function(nrow, ncol) { + m <- matrix(runif(nrow * ncol), nrow = nrow) +} + +generate_dense_matrix <- function(nrow, ncol) { + m <- matrix(runif(nrow * ncol), nrow = nrow) +} + +test_that("select_features works general case", { + m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") + for (fn in c("select_features_variance", "select_features_dispersion", "select_features_mean")) { + res <- do.call(fn, list(m1, num_feats = 5)) + expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting + expect_equal(sum(res$highly_variable), 5) # Only 10 features marked as highly variable + expect_setequal(res$feature, rownames(m1)) + res_more_feats_than_rows <- suppressWarnings(do.call(fn, list(m1, num_feats = 10000))) # more features than rows + res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) + res_feats_partial <- get(fn)(num_feats = 100)(m1) + expect_identical(res_feats_equal_rows, res_feats_partial) + expect_identical(res_more_feats_than_rows, res_feats_equal_rows) + if (fn == "select_features_variance") { + # Check that 
normalization actually does something + res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize_method = NULL)) + # Check that we can do partial functions on normalization too + res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize_method = normalize_log(scale = 1e3, threads = 1L))) + res_norm_implicit_partial <- select_features_variance(normalize_method = normalize_log(scale_factor = 1e3), num_feats = 10)(m1) + expect_identical(res_norm_partial, res_norm_implicit_partial) + expect_true(!all((res_no_norm %>% dplyr::arrange(feature))$score == (res_norm_partial %>% dplyr::arrange(feature))$score)) + } + } +}) + + test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -161,4 +194,84 @@ test_that("Pseudobulk aggregation works with multiple return types", { } } } +}) + + + +test_that("Feature selection by bin variance works", { + mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") + # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to Seurat + res_table <- select_features_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) + res_table_t <- select_features_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) + res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(feature) + res <- mat[res_feats,] + res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(feature) + res_t <- t(mat[,res_feats_t]) + + expect_equal(nrow(res), 10) + expect_equal(ncol(res), 26) + expect_equal(nrow(res_t), 10) + expect_equal(ncol(res_t), 500) +}) + +test_that("LSI works", { + set.seed(12345) + mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to ArchR + n_dimensions <- 5 + lsi_res_obj <- LSI(mat, n_dimensions = n_dimensions) + lsi_res_t_obj <- LSI(t(mat), n_dimensions = n_dimensions) + lsi_res <- lsi_res_obj$cell_embeddings + lsi_res_t <- lsi_res_t_obj$cell_embeddings + # Check that projection results in the same output if used on the same input matrix + lsi_res_proj <- project(lsi_res_obj, mat) + # Check setting pca correlations to non-1 value + lsi_res_obj_corr <- LSI(mat, n_dimensions = n_dimensions, corr_cutoff = 0.2) + + expect_equal(ncol(lsi_res), 5) + expect_equal(nrow(lsi_res), ncol(mat)) + expect_equal(ncol(lsi_res_t), 5) + expect_equal(nrow(lsi_res_t), nrow(mat)) + expect_equal(nrow(lsi_res_proj), ncol(mat)) + expect_lt(ncol(lsi_res_obj_corr$cell_embeddings), n_dimensions) + expect_equal(lsi_res, lsi_res_proj, tolerance = 1e-7) +}) + +test_that("Iterative LSI works", { + skip_if_not_installed("RcppHNSW") + mat <- matrix(data = runif(50000, 0, 1), nrow=500, ncol = 100) %>% as("dgCMatrix") %>% 
as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + lsi_res_obj <- expect_no_error(IterativeLSI(mat, n_iterations = 2, n_dimensions = 10L, cluster_method = cluster_cells_graph(knn_method = knn_annoy))) + lsi_res_proj <- project(lsi_res_obj, mat) + lsi_res_proj_iter_1 <- expect_no_error(project(lsi_res_obj, mat, iteration = 1L)) + lsi_res_embedding <- lsi_res_obj$cell_embeddings + expect_equal(nrow(lsi_res_embedding), ncol(mat)) + expect_equal(ncol(lsi_res_embedding), 10) + expect_equal(nrow(lsi_res_proj_iter_1), ncol(mat)) + expect_equal(ncol(lsi_res_proj_iter_1), 10) + expect_equal(lsi_res_embedding, lsi_res_proj, tolerance = 1e-7) +}) + +test_that("Iterative LSI works with parameterized clustering", { + skip_if_not_installed("RcppAnnoy") + mat <- matrix(data = runif(50000, 0, 1), nrow=500, ncol = 100) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + lsi_res_obj <- expect_no_error( + IterativeLSI( + mat, n_dimensions = 10L, + cluster_method = cluster_cells_graph( + knn_method = knn_annoy(k = 12), + knn_to_graph_method = knn_to_snn_graph(min_val = 0.1) + ) + ) + ) + lsi_res_proj <- project(lsi_res_obj, mat) + lsi_res_embedding <- lsi_res_obj$cell_embeddings + expect_equal(nrow(lsi_res_embedding), ncol(mat)) + expect_equal(ncol(lsi_res_embedding), 10) + expect_equal(lsi_res_embedding, lsi_res_proj, tolerance = 1e-7) })