From 1ef609162089418cfa45f14e6a3173caeed022ee Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 12 Dec 2024 14:49:59 -0800 Subject: [PATCH 1/9] [r] add tf-idf and log normalization functions --- r/NAMESPACE | 2 + r/NEWS.md | 1 + r/R/transforms.R | 54 +++++++++++++++++++++++ r/man/normalize_log.Rd | 21 +++++++++ r/man/normalize_tfidf.Rd | 21 +++++++++ r/pkgdown/_pkgdown.yml | 2 + r/tests/testthat/test-matrix_transforms.R | 46 +++++++++++++++++++ 7 files changed, 147 insertions(+) create mode 100644 r/man/normalize_log.Rd create mode 100644 r/man/normalize_tfidf.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 8625c143..622d3a88 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -65,6 +65,8 @@ export(min_by_row) export(min_scalar) export(multiply_cols) export(multiply_rows) +export(normalize_log) +export(normalize_tfidf) export(nucleosome_counts) export(open_fragments_10x) export(open_fragments_dir) diff --git a/r/NEWS.md b/r/NEWS.md index 9715c95a..890f2755 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -22,6 +22,7 @@ Contributions welcome :) - Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices. If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. 
(pull request #128) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) +- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p (pull request #168) ## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/transforms.R b/r/R/transforms.R index 2a5de994..7d1c46f8 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -923,3 +923,57 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { vars_to_regress = vars_to_regress ) } + +################# +# Normalizations +################# + +#' Normalize a matrix using log normalization +#' @param mat (IterableMatrix) Matrix to normalize +#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization +#' @param add_one (logical) Add one to the matrix before log normalization +#' @returns log normalized matrix. +#' @export +normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE) { + assert_is(mat, "IterableMatrix") + assert_is_numeric(scale_factor) + assert_true(is.logical(add_one)) + assert_greater_than_zero(scale_factor) + mat <- mat * scale_factor + if (!add_one) mat <- mat - 1 + return(log1p(mat)) +} + + +#' Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency +#' @param mat (IterableMatrix) to normalize +#' @param feature_means (numeric) Means of the features to normalize by. If no names are provided, then +#' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. +#' Else, map each feature name to its mean value. +#' @returns tf-idf normalized matrix. 
+#' @export +normalize_tfidf <- function(mat, feature_means = NULL, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(threads) + # If feature means are passed in, only need to calculate term frequency + if (is.null(feature_means)) { + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + feature_means <- mat_stats$row_stats["mean", ] + read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) + } else { + assert_is_numeric(feature_means) + if (!is.null(names(feature_means)) && !is.null(rownames(mat))) { + # Make sure every name in feature means exists in rownames(mat) + # In the case there is a length mismatch but the feature names all exist in feature_means + # will not error out + assert_true(all(rownames(mat) %in% names(feature_means))) + feature_means <- feature_means[rownames(mat)] + } else { + assert_len(feature_means, nrow(mat)) + } + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean",] * nrow(mat) + } + tf <- mat %>% multiply_cols(1 / read_depth) + idf <- 1 / feature_means + return(tf %>% multiply_rows(idf)) +} \ No newline at end of file diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd new file mode 100644 index 00000000..90a57f85 --- /dev/null +++ b/r/man/normalize_log.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transforms.R +\name{normalize_log} +\alias{normalize_log} +\title{Normalize a matrix using log normalization} +\usage{ +normalize_log(mat, scale_factor = 10000, add_one = TRUE) +} +\arguments{ +\item{mat}{(IterableMatrix) Matrix to normalize} + +\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization} + +\item{add_one}{(logical) Add one to the matrix before log normalization} +} +\value{ +log normalized matrix. 
+} +\description{ +Normalize a matrix using log normalization +} diff --git a/r/man/normalize_tfidf.Rd b/r/man/normalize_tfidf.Rd new file mode 100644 index 00000000..bf6a34ef --- /dev/null +++ b/r/man/normalize_tfidf.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transforms.R +\name{normalize_tfidf} +\alias{normalize_tfidf} +\title{Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency} +\usage{ +normalize_tfidf(mat, feature_means = NULL, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) to normalize} + +\item{feature_means}{(numeric) Means of the features to normalize by. If no names are provided, then +each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. +Else, map each feature name to its mean value.} +} +\value{ +tf-idf normalized matrix. +} +\description{ +Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index ea73ec01..22237598 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -126,6 +126,8 @@ reference: - checksum - apply_by_row - regress_out + - normalize_log + - normalize_tfidf - IterableMatrix-methods - pseudobulk_matrix diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index 385605e0..b1941f8b 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -346,3 +346,49 @@ test_that("linear regression works", { expect_equal(as(m1, "matrix"), ans) expect_equal(as(m1t, "matrix"), ans) }) + +test_that("tf-idf normalization works", { + m <- generate_sparse_matrix(5, 5) + rownames(m) <- paste0("row", seq_len(nrow(m))) + rev_rownames <- rev(rownames(m)) + # Create tf-idf normalization for dgCMatrix + res_dgc <- diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix") + + rownames(res_dgc) 
<- rownames(m) + m2 <- as(m, "IterableMatrix") + # Check that we can pass in row means as a (named) vector + row_means <- matrix_stats(m2, row_stats = c("mean"))$row_stats["mean",] + # Test that row means ordering does not matter as long as names exist + row_means_shuffled <- row_means[sample(1:length(row_means))] + # Test that row means can have an extra element as long as all rownames are in the vector + row_means_plus_one <- c(row_means, row6 = 1) + + + res <- normalize_tfidf(m2) + expect_equal(res %>% as("dgCMatrix"), res_dgc) + res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) + expect_identical(res, res_with_row_means) + + res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled) + expect_identical(res_with_row_means, res_with_shuffled_row_means, res) + + res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one) + expect_identical(res, res_with_row_means_with_extra_element) +}) + +test_that("normalize_log works", { + m <- generate_sparse_matrix(5, 5) + m2 <- as(m, "IterableMatrix") + # Test that default params yield the same as log1p on dgCMatrix + res_1 <- as(normalize_log(m2), "dgCMatrix") + expect_equal(res_1, log1p(m*1e4)) + + # Test that changing scale factor works + res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") + expect_identical(res_2, log1p(m*1e5)) + # Test that removing the add_one works + # log of 0 is -inf, but we don't do that on the c side, and just have really large negative numbers. 
+ res_3 <- as(normalize_log(m2, add_one = FALSE), "dgCMatrix") + res_3@x[res_3@x < -700] <- -Inf + expect_identical(as(res_3, "dgeMatrix"), log(m*1e4)) +}) \ No newline at end of file From 98675d0186f2fa7f016543a9f454c3578a5260ef Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 12 Dec 2024 15:13:11 -0800 Subject: [PATCH 2/9] [r] fix normalization tests --- r/tests/testthat/test-matrix_transforms.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index b1941f8b..06bc3337 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -381,14 +381,14 @@ test_that("normalize_log works", { m2 <- as(m, "IterableMatrix") # Test that default params yield the same as log1p on dgCMatrix res_1 <- as(normalize_log(m2), "dgCMatrix") - expect_equal(res_1, log1p(m*1e4)) + expect_equal(res_1, log1p(m*1e4), tolerance = 1e-6) # Test that changing scale factor works res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") - expect_identical(res_2, log1p(m*1e5)) + expect_equal(res_2, log1p(m*1e5), tolerance = 1e-6) # Test that removing the add_one works # log of 0 is -inf, but we don't do that on the c side, and just have really large negative numbers. 
res_3 <- as(normalize_log(m2, add_one = FALSE), "dgCMatrix") - res_3@x[res_3@x < -700] <- -Inf - expect_identical(as(res_3, "dgeMatrix"), log(m*1e4)) + res_3@x[res_3@x < -60] <- -Inf + expect_equal(as(res_3, "dgeMatrix"), log(m*1e4), tolerance = 1e-6) }) \ No newline at end of file From 2f83ae647174ea718f0c05505487ab2ebb55393c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 13 Dec 2024 17:09:56 -0800 Subject: [PATCH 3/9] [r] add in requested changes --- r/R/transforms.R | 38 ++++++++++++++--------- r/man/normalize_log.Rd | 16 +++++----- r/man/normalize_tfidf.Rd | 16 +++++++--- r/pkgdown/_pkgdown.yml | 5 +++ r/tests/testthat/test-matrix_transforms.R | 12 +++---- 5 files changed, 52 insertions(+), 35 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index 7d1c46f8..b2dda267 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -928,31 +928,38 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { # Normalizations ################# -#' Normalize a matrix using log normalization -#' @param mat (IterableMatrix) Matrix to normalize -#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization -#' @param add_one (logical) Add one to the matrix before log normalization -#' @returns log normalized matrix. +#' Normalize a `(features x cells)` matrix using log normalization. +#' @param mat (IterableMatrix) Matrix to normalize. +#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization. +#' @param threads (integer) Number of threads to use.s +#' @returns log normalized matrix. 
For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +#' the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} #' @export -normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE) { +normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE, threads = 1L) { assert_is(mat, "IterableMatrix") assert_is_numeric(scale_factor) assert_true(is.logical(add_one)) assert_greater_than_zero(scale_factor) - mat <- mat * scale_factor - if (!add_one) mat <- mat - 1 - return(log1p(mat)) + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) + mat <- mat %>% multiply_cols(1 / read_depth) + return(log1p(mat * scale_factor)) } -#' Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency -#' @param mat (IterableMatrix) to normalize +#' Normalize a `(features x cells)` matrix using term frequency-inverse document frequency. #' @param feature_means (numeric) Means of the features to normalize by. If no names are provided, then #' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. #' Else, map each feature name to its mean value. -#' @returns tf-idf normalized matrix. +#' @returns tf-idf normalized matrix. 
For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +#' the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +#' @inheritParams normalize_log #' @export -normalize_tfidf <- function(mat, feature_means = NULL, threads = 1L) { +normalize_tfidf <- function( + mat, feature_means = NULL, + scale_factor = 1e4, threads = 1L +) { assert_is(mat, "IterableMatrix") assert_is_wholenumber(threads) # If feature means are passed in, only need to calculate term frequency @@ -971,9 +978,10 @@ normalize_tfidf <- function(mat, feature_means = NULL, threads = 1L) { } else { assert_len(feature_means, nrow(mat)) } - read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean",] * nrow(mat) + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) } tf <- mat %>% multiply_cols(1 / read_depth) idf <- 1 / feature_means - return(tf %>% multiply_rows(idf)) + tf_idf_mat <- tf %>% multiply_rows(idf) + return(log1p(tf_idf_mat * scale_factor)) } \ No newline at end of file diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd index 90a57f85..9de93071 100644 --- a/r/man/normalize_log.Rd +++ b/r/man/normalize_log.Rd @@ -2,20 +2,22 @@ % Please edit documentation in R/transforms.R \name{normalize_log} \alias{normalize_log} -\title{Normalize a matrix using log normalization} +\title{Normalize a \verb{(features x cells)} matrix using log normalization.} \usage{ -normalize_log(mat, scale_factor = 10000, add_one = TRUE) +normalize_log(mat, scale_factor = 10000, add_one = TRUE, threads = 1L) } \arguments{ -\item{mat}{(IterableMatrix) Matrix to normalize} +\item{mat}{(IterableMatrix) Matrix to normalize.} -\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization} +\item{scale_factor}{(numeric) Scale factor to 
multiply matrix by for log normalization.} -\item{add_one}{(logical) Add one to the matrix before log normalization} +\item{threads}{(integer) Number of threads to use.s} } \value{ -log normalized matrix. +log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} } \description{ -Normalize a matrix using log normalization +Normalize a \verb{(features x cells)} matrix using log normalization. } diff --git a/r/man/normalize_tfidf.Rd b/r/man/normalize_tfidf.Rd index bf6a34ef..8dc50b84 100644 --- a/r/man/normalize_tfidf.Rd +++ b/r/man/normalize_tfidf.Rd @@ -2,20 +2,26 @@ % Please edit documentation in R/transforms.R \name{normalize_tfidf} \alias{normalize_tfidf} -\title{Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency} +\title{Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency.} \usage{ -normalize_tfidf(mat, feature_means = NULL, threads = 1L) +normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) } \arguments{ -\item{mat}{(IterableMatrix) to normalize} +\item{mat}{(IterableMatrix) Matrix to normalize.} \item{feature_means}{(numeric) Means of the features to normalize by. If no names are provided, then each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. Else, map each feature name to its mean value.} + +\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} + +\item{threads}{(integer) Number of threads to use.s} } \value{ -tf-idf normalized matrix. +tf-idf normalized matrix. 
For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} } \description{ -Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency +Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency. } diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 22237598..ae2772eb 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -9,6 +9,11 @@ template: includes: in_header: | + + + + + after_body: | diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index 06bc3337..c4c5edb5 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -352,7 +352,7 @@ test_that("tf-idf normalization works", { rownames(m) <- paste0("row", seq_len(nrow(m))) rev_rownames <- rev(rownames(m)) # Create tf-idf normalization for dgCMatrix - res_dgc <- diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix") + res_dgc <- log1p((diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix")) * 1e4) rownames(res_dgc) <- rownames(m) m2 <- as(m, "IterableMatrix") @@ -378,17 +378,13 @@ test_that("tf-idf normalization works", { test_that("normalize_log works", { m <- generate_sparse_matrix(5, 5) + res_dgc <- m %*% diag(1/colSums(m)) %>% as("dgCMatrix") m2 <- as(m, "IterableMatrix") # Test that default params yield the same as log1p on dgCMatrix res_1 <- as(normalize_log(m2), "dgCMatrix") - expect_equal(res_1, log1p(m*1e4), tolerance = 1e-6) + expect_equal(res_1, log1p(res_dgc*1e4), tolerance = 1e-6) # Test that changing scale factor works res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") - expect_equal(res_2, log1p(m*1e5), tolerance = 1e-6) - # Test that removing the 
add_one works - # log of 0 is -inf, but we don't do that on the c side, and just have really large negative numbers. - res_3 <- as(normalize_log(m2, add_one = FALSE), "dgCMatrix") - res_3@x[res_3@x < -60] <- -Inf - expect_equal(as(res_3, "dgeMatrix"), log(m*1e4), tolerance = 1e-6) + expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6) }) \ No newline at end of file From 6381f74d02982c7972198aa32a042a99b0808069 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 13 Dec 2024 17:15:56 -0800 Subject: [PATCH 4/9] [r] removed unused variable --- r/R/transforms.R | 3 +-- r/man/normalize_log.Rd | 2 +- r/tests/testthat/test-matrix_transforms.R | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index b2dda267..8a2bd25b 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -936,10 +936,9 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: #' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} #' @export -normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE, threads = 1L) { +normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_is(mat, "IterableMatrix") assert_is_numeric(scale_factor) - assert_true(is.logical(add_one)) assert_greater_than_zero(scale_factor) read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) mat <- mat %>% multiply_cols(1 / read_depth) diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd index 9de93071..97d8c92e 100644 --- a/r/man/normalize_log.Rd +++ b/r/man/normalize_log.Rd @@ -4,7 +4,7 @@ \alias{normalize_log} \title{Normalize a \verb{(features x cells)} matrix using log normalization.} \usage{ -normalize_log(mat, scale_factor = 10000, add_one = TRUE, threads = 1L) +normalize_log(mat, scale_factor = 10000, threads = 1L) } \arguments{ 
\item{mat}{(IterableMatrix) Matrix to normalize.} diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index c4c5edb5..67641e54 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -365,7 +365,7 @@ test_that("tf-idf normalization works", { res <- normalize_tfidf(m2) - expect_equal(res %>% as("dgCMatrix"), res_dgc) + expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6) res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) expect_identical(res, res_with_row_means) From 199ae82355991e882de343128abdf62b99bc504b Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Fri, 10 Jan 2025 16:51:00 -0800 Subject: [PATCH 5/9] Update docs --- r/R/transforms.R | 28 ++++++++++++++----------- r/man/normalize.Rd | 45 ++++++++++++++++++++++++++++++++++++++++ r/man/normalize_log.Rd | 23 -------------------- r/man/normalize_tfidf.Rd | 27 ------------------------ r/pkgdown/_pkgdown.yml | 22 +++++++++++--------- 5 files changed, 73 insertions(+), 72 deletions(-) create mode 100644 r/man/normalize.Rd delete mode 100644 r/man/normalize_log.Rd delete mode 100644 r/man/normalize_tfidf.Rd diff --git a/r/R/transforms.R b/r/R/transforms.R index 8a2bd25b..e79c404d 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -928,13 +928,19 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { # Normalizations ################# -#' Normalize a `(features x cells)` matrix using log normalization. -#' @param mat (IterableMatrix) Matrix to normalize. +#' Normalization recipes +#' +#' Apply standard normalizations to a `(features x cells)` counts matrix. +#' +#' @rdname normalize +#' @param mat (IterableMatrix) Counts matrix to normalize. `(features x cells)` #' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization. -#' @param threads (integer) Number of threads to use.s -#' @returns log normalized matrix. 
For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, -#' the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: -#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +#' @param threads (integer) Number of threads to use. +#' @returns For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +#' transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: +#' +#' - `normalize_log`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +#' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_is(mat, "IterableMatrix") @@ -946,14 +952,12 @@ normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { } -#' Normalize a `(features x cells)` matrix using term frequency-inverse document frequency. -#' @param feature_means (numeric) Means of the features to normalize by. If no names are provided, then +#' @rdname normalize +#' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then #' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. #' Else, map each feature name to its mean value. -#' @returns tf-idf normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, -#' the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: -#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} -#' @inheritParams normalize_log +#' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +#' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 
2019, used by default in `ArchR::addIterativeLSI()` and `Signac::RunTFIDF()` #' @export normalize_tfidf <- function( mat, feature_means = NULL, diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd new file mode 100644 index 00000000..f4bde193 --- /dev/null +++ b/r/man/normalize.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transforms.R +\name{normalize_log} +\alias{normalize_log} +\alias{normalize_tfidf} +\title{Normalization recipes} +\usage{ +normalize_log(mat, scale_factor = 10000, threads = 1L) + +normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) Counts matrix to normalize. \verb{(features x cells)}} + +\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} + +\item{threads}{(integer) Number of threads to use.} + +\item{feature_means}{(numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then +each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. +Else, map each feature name to its mean value.} +} +\value{ +For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as: +\itemize{ +\item \code{normalize_log}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +} + +\itemize{ +\item \code{normalize_tfidf}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +} +} +\description{ +Apply standard normalizations to a \verb{(features x cells)} counts matrix. +} +\details{ +\itemize{ +\item \code{normalize_log}: Corresponds to \code{Seurat::NormalizeLog} +} + +\itemize{ +\item \code{normalize_tfidf}: This follows the formula from Stuart, Butler et al. 
2019, used by default in \code{ArchR::addIterativeLSI()} and \code{Signac::RunTFIDF()} +} +} diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd deleted file mode 100644 index 97d8c92e..00000000 --- a/r/man/normalize_log.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/transforms.R -\name{normalize_log} -\alias{normalize_log} -\title{Normalize a \verb{(features x cells)} matrix using log normalization.} -\usage{ -normalize_log(mat, scale_factor = 10000, threads = 1L) -} -\arguments{ -\item{mat}{(IterableMatrix) Matrix to normalize.} - -\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} - -\item{threads}{(integer) Number of threads to use.s} -} -\value{ -log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, -the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: -\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} -} -\description{ -Normalize a \verb{(features x cells)} matrix using log normalization. -} diff --git a/r/man/normalize_tfidf.Rd b/r/man/normalize_tfidf.Rd deleted file mode 100644 index 8dc50b84..00000000 --- a/r/man/normalize_tfidf.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/transforms.R -\name{normalize_tfidf} -\alias{normalize_tfidf} -\title{Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency.} -\usage{ -normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) -} -\arguments{ -\item{mat}{(IterableMatrix) Matrix to normalize.} - -\item{feature_means}{(numeric) Means of the features to normalize by. If no names are provided, then -each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. 
-Else, map each feature name to its mean value.} - -\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} - -\item{threads}{(integer) Number of threads to use.s} -} -\value{ -tf-idf normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, -the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: -\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} -} -\description{ -Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency. -} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index ae2772eb..52d03343 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -131,20 +131,14 @@ reference: - checksum - apply_by_row - regress_out - - normalize_log - - normalize_tfidf - IterableMatrix-methods - pseudobulk_matrix -- title: "Reference Annotations" +- title: "Single-cell analysis helpers" +- subtitle: "Dimensionality reduction" - contents: - - human_gene_mapping - - match_gene_symbol - - read_gtf - - read_bed - - read_ucsc_chrom_sizes - -- title: "Clustering" + - normalize_log +- subtitle: "Clustering" - contents: - knn_hnsw - cluster_graph_leiden @@ -182,3 +176,11 @@ reference: - discrete_palette - collect_features - rotate_x_labels + +- title: "Reference Annotations" +- contents: + - human_gene_mapping + - match_gene_symbol + - read_gtf + - read_bed + - read_ucsc_chrom_sizes From 553f262a604e4f366089ff9741e612b8f47821fe Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Fri, 10 Jan 2025 16:51:58 -0800 Subject: [PATCH 6/9] Update NEWS --- r/NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/r/NEWS.md b/r/NEWS.md index 07890a0e..ac2109fb 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -10,6 +10,7 @@ Contributions welcome :) ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for 
`obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166) +- Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168) ## Bug-fixes - Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175) From 7511f0b7e32b7ee7a726ddd73912ac272f3e0d79 Mon Sep 17 00:00:00 2001 From: Ben Parks Date: Fri, 10 Jan 2025 17:00:53 -0800 Subject: [PATCH 7/9] Update docs --- r/R/transforms.R | 2 +- r/man/normalize.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index e79c404d..05759f9e 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -928,7 +928,7 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { # Normalizations ################# -#' Normalization recipes +#' Normalization helper functions #' #' Apply standard normalizations to a `(features x cells)` counts matrix. #' diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd index f4bde193..ac53f2c0 100644 --- a/r/man/normalize.Rd +++ b/r/man/normalize.Rd @@ -3,7 +3,7 @@ \name{normalize_log} \alias{normalize_log} \alias{normalize_tfidf} -\title{Normalization recipes} +\title{Normalization helper functions} \usage{ normalize_log(mat, scale_factor = 10000, threads = 1L) From 435724b4dbccdd9fefa89778a045f58b1b360bd7 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 13 Jan 2025 16:46:14 -0800 Subject: [PATCH 8/9] [r] add partial args to normalizations --- r/R/transforms.R | 17 +++++++++++++++++ r/tests/testthat/test-matrix_transforms.R | 3 +++ 2 files changed, 20 insertions(+) diff --git a/r/R/transforms.R b/r/R/transforms.R index 05759f9e..b097e898 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -943,6 +943,14 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { + if 
(rlang::is_missing(mat)) { + return( + purrr::partial( + normalize_log, + scale_factor = scale_factor, threads = threads + ) + ) + } assert_is(mat, "IterableMatrix") assert_is_numeric(scale_factor) assert_greater_than_zero(scale_factor) @@ -963,6 +971,15 @@ normalize_tfidf <- function( mat, feature_means = NULL, scale_factor = 1e4, threads = 1L ) { + if (rlang::is_missing(mat)) { + return( + purrr::partial( + normalize_tfidf, + feature_means = feature_means, scale_factor = scale_factor, + threads = threads + ) + ) + } assert_is(mat, "IterableMatrix") assert_is_wholenumber(threads) # If feature means are passed in, only need to calculate term frequency diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index 67641e54..24cd9c23 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -367,6 +367,8 @@ test_that("tf-idf normalization works", { res <- normalize_tfidf(m2) expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6) res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) + res_with_row_means_partial <- normalize_tfidf(feature_means = row_means)(m2) + expect_equal(res_with_row_means, res_with_row_means_partial) expect_identical(res, res_with_row_means) res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled) @@ -386,5 +388,6 @@ test_that("normalize_log works", { # Test that changing scale factor works res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") + res_2_partial <- as(normalize_log(scale_factor = 1e5)(m2), "dgCMatrix") expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6) }) \ No newline at end of file From 8dbe8e52be1d5e79aa895deeb095fd80314c980c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 13 Jan 2025 19:18:45 -0800 Subject: [PATCH 9/9] [r] create mechanism for partial calls on explicit args --- r/R/transforms.R | 13 ++++++------- r/R/utils.R | 16 ++++++++++++++++ 2 files changed, 
22 insertions(+), 7 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index b097e898..02ab51b8 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -943,11 +943,11 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog` #' @export normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { + # browser() if (rlang::is_missing(mat)) { return( - purrr::partial( - normalize_log, - scale_factor = scale_factor, threads = threads + partial_explicit( + normalize_log, scale_factor = scale_factor, threads = threads ) ) } @@ -973,10 +973,9 @@ normalize_tfidf <- function( ) { if (rlang::is_missing(mat)) { return( - purrr::partial( - normalize_tfidf, - feature_means = feature_means, scale_factor = scale_factor, - threads = threads + partial_explicit( + normalize_tfidf, feature_means = feature_means, + scale_factor = scale_factor, threads = threads ) ) } diff --git a/r/R/utils.R b/r/R/utils.R index 4ea62d15..784f6106 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -56,4 +56,20 @@ log_progress <- function(msg, add_timestamp = TRUE){ } else { message(msg) } +} + +# Helper function to create partial explicit functions +# This builds upon purrr::partial by allowing for nested partial calls, where each partial call +# only does partial application of the arguments that were explicitly provided. +partial_explicit <- function(fn, ...) { + args <- rlang::enquos(...) 
+ evaluated_args <- purrr::map(args, rlang::eval_tidy) + # Fetch the default arguments from the function definition + default_args <- formals(fn) + # Keep only explicitly provided arguments that were evaluated + # where the values are different from the default arguments + explicitly_passed_args <- evaluated_args[names(evaluated_args) %in% names(default_args) & + !purrr::map2_lgl(evaluated_args, default_args[names(evaluated_args)], identical)] + # Return a partially applied version of the function using evaluated arguments + return(purrr::partial(fn, !!!explicitly_passed_args)) } \ No newline at end of file