diff --git a/r/NAMESPACE b/r/NAMESPACE index 982ca440..5cc21059 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -65,6 +65,8 @@ export(min_by_row) export(min_scalar) export(multiply_cols) export(multiply_rows) +export(normalize_log) +export(normalize_tfidf) export(nucleosome_counts) export(open_fragments_10x) export(open_fragments_dir) diff --git a/r/NEWS.md b/r/NEWS.md index 6cab91a5..ac2109fb 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -10,6 +10,7 @@ Contributions welcome :) ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166) +- Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168) ## Bug-fixes - Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175) @@ -39,6 +40,7 @@ of the new features this release and will continue to help with maintenance and - Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices. If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) +- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) ## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/transforms.R b/r/R/transforms.R index 2a5de994..02ab51b8 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -923,3 +923,84 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { vars_to_regress = vars_to_regress ) } + +################# +# Normalizations +################# + +#' Normalization helper functions +#' +#' Apply standard normalizations to a `(features x cells)` counts matrix. +#' +#' @rdname normalize +#' @param mat (IterableMatrix) Counts matrix to normalize. `(features x cells)` +#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization. +#' @param threads (integer) Number of threads to use. +#' @returns For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +#' transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: +#' +#' - `normalize_log`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +#' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` +#' @export +normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { + # browser() + if (rlang::is_missing(mat)) { + return( + partial_explicit( + normalize_log, scale_factor = scale_factor, threads = threads + ) + ) + } + assert_is(mat, "IterableMatrix") + assert_is_numeric(scale_factor) + assert_greater_than_zero(scale_factor) + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) + mat <- mat %>% multiply_cols(1 / read_depth) + return(log1p(mat * scale_factor)) +} + + +#' @rdname normalize +#' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then +#' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. +#' Else, map each feature name to its mean value. +#' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +#' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 2019, used by default in `ArchR::addIterativeLSI()` and `Signac::RunTFIDF()` +#' @export +normalize_tfidf <- function( + mat, feature_means = NULL, + scale_factor = 1e4, threads = 1L +) { + if (rlang::is_missing(mat)) { + return( + partial_explicit( + normalize_tfidf, feature_means = feature_means, + scale_factor = scale_factor, threads = threads + ) + ) + } + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(threads) + # If feature means are passed in, only need to calculate term frequency + if (is.null(feature_means)) { + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + feature_means <- mat_stats$row_stats["mean", ] + read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) + } else { + assert_is_numeric(feature_means) + if (!is.null(names(feature_means)) && !is.null(rownames(mat))) { + # Make sure every name in feature means exists in rownames(mat) + # In the case there is a length mismatch but the feature names all exist in feature_means + # will not error out + assert_true(all(rownames(mat) %in% names(feature_means))) + feature_means <- feature_means[rownames(mat)] + } else { + assert_len(feature_means, nrow(mat)) + } + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) + } + tf <- mat %>% multiply_cols(1 / read_depth) + idf <- 1 / feature_means + tf_idf_mat <- tf %>% multiply_rows(idf) + return(log1p(tf_idf_mat * scale_factor)) +} \ No newline at end of file diff --git a/r/R/utils.R b/r/R/utils.R index 4ea62d15..784f6106 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -56,4 +56,20 @@ log_progress <- function(msg, add_timestamp = TRUE){ } else { message(msg) } +} + +# Helper function to create partial explicit functions +# This builds upon purrr::partial by allowing for nested partial calls, where each partial call +# only does partial application of the arguments that were explicitly provided. +partial_explicit <- function(fn, ...) { + args <- rlang::enquos(...) + evaluated_args <- purrr::map(args, rlang::eval_tidy) + # Fetch the default arguments from the function definition + default_args <- formals(fn) + # Keep only explicitly provided arguments that were evaluated + # where the values are different from the default arguments + explicitly_passed_args <- evaluated_args[names(evaluated_args) %in% names(default_args) & + !purrr::map2_lgl(evaluated_args, default_args[names(evaluated_args)], identical)] + # Return a partially applied version of the function using evaluated arguments + return(purrr::partial(fn, !!!explicitly_passed_args)) } \ No newline at end of file diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd new file mode 100644 index 00000000..ac53f2c0 --- /dev/null +++ b/r/man/normalize.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transforms.R +\name{normalize_log} +\alias{normalize_log} +\alias{normalize_tfidf} +\title{Normalization helper functions} +\usage{ +normalize_log(mat, scale_factor = 10000, threads = 1L) + +normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) Counts matrix to normalize. \verb{(features x cells)}} + +\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} + +\item{threads}{(integer) Number of threads to use.} + +\item{feature_means}{(numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then +each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. +Else, map each feature name to its mean value.} +} +\value{ +For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: +\itemize{ +\item \code{normalize_log}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +} + +\itemize{ +\item \code{normalize_tfidf}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +} +} +\description{ +Apply standard normalizations to a \verb{(features x cells)} counts matrix. +} +\details{ +\itemize{ +\item \code{normalize_log}: Corresponds to \code{Seurat::NormalizeLog} +} + +\itemize{ +\item \code{normalize_tfidf}: This follows the formula from Stuart, Butler et al. 2019, used by default in \code{ArchR::addIterativeLSI()} and \code{Signac::RunTFIDF()} +} +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 64d16f01..ff05e434 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -9,6 +9,11 @@ template: includes: in_header: | + + + + + after_body: | @@ -130,15 +135,11 @@ reference: - IterableMatrix-methods - pseudobulk_matrix -- title: "Reference Annotations" +- title: "Single-cell analysis helpers" +- subtitle: "Dimensionality reduction" - contents: - - human_gene_mapping - - match_gene_symbol - - read_gtf - - read_bed - - read_ucsc_chrom_sizes - -- title: "Clustering" + - normalize_log +- subtitle: "Clustering" - contents: - knn_hnsw - cluster_graph_leiden @@ -176,3 +177,11 @@ reference: - discrete_palette - collect_features - rotate_x_labels + +- title: "Reference Annotations" +- contents: + - human_gene_mapping + - match_gene_symbol + - read_gtf + - read_bed + - read_ucsc_chrom_sizes diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index 385605e0..24cd9c23 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -346,3 +346,48 @@ test_that("linear regression works", { expect_equal(as(m1, "matrix"), ans) expect_equal(as(m1t, "matrix"), ans) }) + +test_that("tf-idf normalization works", { + m <- generate_sparse_matrix(5, 5) + rownames(m) <- paste0("row", seq_len(nrow(m))) + rev_rownames <- rev(rownames(m)) + # Create tf-idf normalization for dgCMatrix + res_dgc <- log1p((diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix")) * 1e4) + + rownames(res_dgc) <- rownames(m) + m2 <- as(m, "IterableMatrix") + # Check that we can pass in row means as a (named) vector + row_means <- matrix_stats(m2, row_stats = c("mean"))$row_stats["mean",] + # Test that row means ordering does not matter as long as names exist + row_means_shuffled <- row_means[sample(1:length(row_means))] + # Test that row means can have an extra element as long as all rownames are in the vector + row_means_plus_one <- c(row_means, row6 = 1) + + + res <- normalize_tfidf(m2) + expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6) + res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) + res_with_row_means_partial <- normalize_tfidf(feature_means = row_means)(m2) + expect_equal(res_with_row_means, res_with_row_means_partial) + expect_identical(res, res_with_row_means) + + res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled) + expect_identical(res_with_row_means, res_with_shuffled_row_means, res) + + res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one) + expect_identical(res, res_with_row_means_with_extra_element) +}) + +test_that("normalize_log works", { + m <- generate_sparse_matrix(5, 5) + res_dgc <- m %*% diag(1/colSums(m)) %>% as("dgCMatrix") + m2 <- as(m, "IterableMatrix") + # Test that default params yield the same as log1p on dgCMatrix + res_1 <- as(normalize_log(m2), "dgCMatrix") + expect_equal(res_1, log1p(res_dgc*1e4), tolerance = 1e-6) + + # Test that changing scale factor works + res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") + res_2_partial <- as(normalize_log(scale_factor = 1e5)(m2), "dgCMatrix") + expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6) +}) \ No newline at end of file