bnprks · immanuelazn · Dec 12, 2024 · Dec 12, 2024 · Dec 14, 2024 · Dec 14, 2024
diff --git a/r/NAMESPACE b/r/NAMESPACE
@@ -65,6 +65,8 @@ export(min_by_row)
 export(min_scalar)
 export(multiply_cols)
 export(multiply_rows)
+export(normalize_log)
+export(normalize_tfidf)
 export(nucleosome_counts)
 export(open_fragments_10x)
 export(open_fragments_dir)

diff --git a/r/NEWS.md b/r/NEWS.md
@@ -10,6 +10,7 @@ Contributions welcome :)
 
 ## Features
 - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166)
+- Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168)
 
 ## Bug-fixes
 - Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175) 
@@ -39,6 +40,7 @@ of the new features this release and will continue to help with maintenance and
 - Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices.
   If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128)
 - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128)
+- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168)
 
 ## Improvements
 - `trackplot_loop()` now accepts discrete color scales

diff --git a/r/R/transforms.R b/r/R/transforms.R
@@ -923,3 +923,84 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) {
     vars_to_regress = vars_to_regress
   )
 }
+
+#################
+# Normalizations
+#################
+
+#' Normalization helper functions
+#' 
+#' Apply standard normalizations to a `(features x cells)` counts matrix.
+#' 
+#' @rdname normalize
+#' @param mat (IterableMatrix) Counts matrix to normalize. `(features x cells)`
+#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization.
+#' @param threads (integer) Number of threads to use.
+#' @returns For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, 
+#'  transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as:
+#'  
+#'   - `normalize_log`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)}
+#' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog`
+#' @export
+normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) {
+  # browser()
+  if (rlang::is_missing(mat)) {
+    return(
+      partial_explicit(
+        normalize_log, scale_factor = scale_factor, threads = threads
+      )
+    )
+  }
+  assert_is(mat, "IterableMatrix")
+  assert_is_numeric(scale_factor)
+  assert_greater_than_zero(scale_factor)
+  read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat)
+  mat <- mat %>% multiply_cols(1 / read_depth)
+  return(log1p(mat * scale_factor))
+}
+
+
+#' @rdname normalize
+#' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then 
+#' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix.
+#' Else, map each feature name to its mean value.
+#' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)}
+#' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 2019, used by default in `ArchR::addIterativeLSI()` and `Signac::RunTFIDF()`
+#' @export
+normalize_tfidf <- function(
+  mat, feature_means = NULL,
+  scale_factor = 1e4, threads = 1L
+) {
+  if (rlang::is_missing(mat)) {
+    return(
+      partial_explicit(
+        normalize_tfidf, feature_means = feature_means, 
+        scale_factor = scale_factor, threads = threads
+      )
+    )
+  }
+  assert_is(mat, "IterableMatrix")
+  assert_is_wholenumber(threads)
+  # If feature means are passed in, only need to calculate term frequency
+  if (is.null(feature_means)) {
+    mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"))
+    feature_means <- mat_stats$row_stats["mean", ]
+    read_depth <- mat_stats$col_stats["mean", ] * nrow(mat)
+  } else {
+    assert_is_numeric(feature_means)
+    if (!is.null(names(feature_means)) && !is.null(rownames(mat))) {
+      # Make sure every name in feature means exists in rownames(mat)
+      # In the case there is a length mismatch but the feature names all exist in feature_means
+      # will not error out
+      assert_true(all(rownames(mat) %in% names(feature_means)))
+      feature_means <- feature_means[rownames(mat)]
+    } else {
+      assert_len(feature_means, nrow(mat))
+    }
+    read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat)
+  }
+  tf <- mat %>% multiply_cols(1 / read_depth)
+  idf <- 1 / feature_means
+  tf_idf_mat <- tf %>% multiply_rows(idf)
+  return(log1p(tf_idf_mat * scale_factor))
+}
diff --git a/r/R/utils.R b/r/R/utils.R
@@ -56,4 +56,20 @@ log_progress <- function(msg, add_timestamp = TRUE){
   } else {
     message(msg)
   }
+}
+
+# Helper function to create partial explicit functions
+# This builds upon purrr::partial by allowing for nested partial calls, where each partial call
+# only does partial application of the arguments that were explicitly provided.
+partial_explicit <- function(fn, ...) {
+  args <- rlang::enquos(...)
+  evaluated_args <- purrr::map(args, rlang::eval_tidy)
+  # Fetch the default arguments from the function definition
+  default_args <- formals(fn)
+  # Keep only explicitly provided arguments that were evaluated
+  # where the values are different from the default arguments
+  explicitly_passed_args <- evaluated_args[names(evaluated_args) %in% names(default_args) & 
+                                            !purrr::map2_lgl(evaluated_args, default_args[names(evaluated_args)], identical)]
+  # Return a partially applied version of the function using evaluated arguments
+  return(purrr::partial(fn, !!!explicitly_passed_args))
 }
diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd
diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml
@@ -9,6 +9,11 @@ template:
   includes:
     in_header: |
        <script defer data-domain="benparks.net" src="https://plausible.benparks.net/js/visit-counts.js"></script>
+       <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.15/dist/katex.min.css" integrity="sha384-Htz9HMhiwV8GuQ28Xr9pEs1B4qJiYu/nYLLwlDklR53QibDfmQzi7rYxXhMH/5/u" crossorigin="anonymous">
+       <!-- The loading of KaTeX is deferred to speed up page rendering -->
+       <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.15/dist/katex.min.js" integrity="sha384-bxmi2jLGCvnsEqMuYLKE/KsVCxV3PqmKeK6Y6+lmNXBry6+luFkEOsmp5vD9I/7+" crossorigin="anonymous"></script>
+       <!-- To automatically render math in text elements, include the auto-render extension: -->
+       <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.15/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous" onload="renderMathInElement(document.body);"></script>
     after_body: |
        <img src="https://plausible.benparks.net/flask-plausible/bpcells-docs.png" style="position:absolute;" />
 
@@ -130,15 +135,11 @@ reference:
   - IterableMatrix-methods
   - pseudobulk_matrix
 
-- title: "Reference Annotations"
+- title: "Single-cell analysis helpers"
+- subtitle: "Dimensionality reduction"
 - contents:
-  - human_gene_mapping
-  - match_gene_symbol
-  - read_gtf
-  - read_bed
-  - read_ucsc_chrom_sizes
-
-- title: "Clustering"
+  - normalize_log
+- subtitle: "Clustering"
 - contents:
   - knn_hnsw
   - cluster_graph_leiden
@@ -176,3 +177,11 @@ reference:
   - discrete_palette
   - collect_features
   - rotate_x_labels
+
+- title: "Reference Annotations"
+- contents:
+  - human_gene_mapping
+  - match_gene_symbol
+  - read_gtf
+  - read_bed
+  - read_ucsc_chrom_sizes
diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R
@@ -346,3 +346,48 @@ test_that("linear regression works", {
     expect_equal(as(m1, "matrix"), ans)
     expect_equal(as(m1t, "matrix"), ans)
 })
+
+test_that("tf-idf normalization works", {
+    m <- generate_sparse_matrix(5, 5)
+    rownames(m) <- paste0("row", seq_len(nrow(m)))
+    rev_rownames <- rev(rownames(m))
+    # Create tf-idf normalization for dgCMatrix
+    res_dgc <- log1p((diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix")) * 1e4)
+
+    rownames(res_dgc) <- rownames(m)
+    m2 <- as(m, "IterableMatrix")
+    # Check that we can pass in row means as a (named) vector
+    row_means <- matrix_stats(m2, row_stats = c("mean"))$row_stats["mean",]
+    # Test that row means ordering does not matter as long as names exist
+    row_means_shuffled <- row_means[sample(1:length(row_means))]
+    # Test that row means can have an extra element as long as all rownames are in the vector
+    row_means_plus_one <- c(row_means, row6 = 1)
+
+
+    res <- normalize_tfidf(m2)
+    expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6)
+    res_with_row_means <- normalize_tfidf(m2, feature_means = row_means)
+    res_with_row_means_partial <- normalize_tfidf(feature_means = row_means)(m2)
+    expect_equal(res_with_row_means, res_with_row_means_partial)
+    expect_identical(res, res_with_row_means)
+
+    res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled)
+    expect_identical(res_with_row_means, res_with_shuffled_row_means, res)
+
+    res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one)
+    expect_identical(res, res_with_row_means_with_extra_element)
+})
+
+test_that("normalize_log works", {
+    m <- generate_sparse_matrix(5, 5)
+    res_dgc <- m %*% diag(1/colSums(m)) %>% as("dgCMatrix")
+    m2 <- as(m, "IterableMatrix")
+    # Test that default params yield the same as log1p on dgCMatrix
+    res_1 <- as(normalize_log(m2), "dgCMatrix")
+    expect_equal(res_1, log1p(res_dgc*1e4), tolerance = 1e-6)
+
+    # Test that changing scale factor works
+    res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix")
+    res_2_partial <- as(normalize_log(scale_factor = 1e5)(m2), "dgCMatrix")
+    expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6)
+})