Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ export(min_by_row)
export(min_scalar)
export(multiply_cols)
export(multiply_rows)
export(normalize_log)
export(normalize_tfidf)
export(nucleosome_counts)
export(open_fragments_10x)
export(open_fragments_dir)
Expand Down
2 changes: 2 additions & 0 deletions r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Contributions welcome :)

## Features
- Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166)
- Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168)

## Bug-fixes
- Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175)
Expand Down Expand Up @@ -39,6 +40,7 @@ of the new features this release and will continue to help with maintenance and
- Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices.
If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128)
- Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128)
- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168)

## Improvements
- `trackplot_loop()` now accepts discrete color scales
Expand Down
81 changes: 81 additions & 0 deletions r/R/transforms.R
Original file line number Diff line number Diff line change
Expand Up @@ -923,3 +923,84 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) {
vars_to_regress = vars_to_regress
)
}

#################
# Normalizations
#################

#' Normalization helper functions
#'
#' Apply standard normalizations to a `(features x cells)` counts matrix.
#'
#' @rdname normalize
#' @param mat (IterableMatrix) Counts matrix to normalize. `(features x cells)`
#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization.
#' @param threads (integer) Number of threads to use.
#' @returns For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells,
#' transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as:
#'
#' - `normalize_log`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)}
#' @details - `normalize_log`: Corresponds to `Seurat::NormalizeLog`
#' @export
normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) {
# browser()
if (rlang::is_missing(mat)) {
return(
partial_explicit(
normalize_log, scale_factor = scale_factor, threads = threads
)
)
}
assert_is(mat, "IterableMatrix")
assert_is_numeric(scale_factor)
assert_greater_than_zero(scale_factor)
read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat)
mat <- mat %>% multiply_cols(1 / read_depth)
return(log1p(mat * scale_factor))
}


#' @rdname normalize
#' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by. If no names are provided, then
#' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix.
#' Else, map each feature name to its mean value.
#' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)}
#' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 2019, used by default in `ArchR::addIterativeLSI()` and `Signac::RunTFIDF()`
#' @export
normalize_tfidf <- function(
mat, feature_means = NULL,
scale_factor = 1e4, threads = 1L
) {
if (rlang::is_missing(mat)) {
return(
partial_explicit(
normalize_tfidf, feature_means = feature_means,
scale_factor = scale_factor, threads = threads
)
)
}
assert_is(mat, "IterableMatrix")
assert_is_wholenumber(threads)
# If feature means are passed in, only need to calculate term frequency
if (is.null(feature_means)) {
mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"))
feature_means <- mat_stats$row_stats["mean", ]
read_depth <- mat_stats$col_stats["mean", ] * nrow(mat)
} else {
assert_is_numeric(feature_means)
if (!is.null(names(feature_means)) && !is.null(rownames(mat))) {
# Make sure every name in feature means exists in rownames(mat)
# In the case there is a length mismatch but the feature names all exist in feature_means
# will not error out
assert_true(all(rownames(mat) %in% names(feature_means)))
feature_means <- feature_means[rownames(mat)]
} else {
assert_len(feature_means, nrow(mat))
}
read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat)
}
tf <- mat %>% multiply_cols(1 / read_depth)
idf <- 1 / feature_means
tf_idf_mat <- tf %>% multiply_rows(idf)
return(log1p(tf_idf_mat * scale_factor))
}
16 changes: 16 additions & 0 deletions r/R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,20 @@ log_progress <- function(msg, add_timestamp = TRUE){
} else {
message(msg)
}
}

# Helper function to create partial explicit functions
# This builds upon purrr::partial by allowing for nested partial calls, where each partial call
# only does partial application of the arguments that were explicitly provided.
partial_explicit <- function(fn, ...) {
args <- rlang::enquos(...)
evaluated_args <- purrr::map(args, rlang::eval_tidy)
# Fetch the default arguments from the function definition
default_args <- formals(fn)
# Keep only explicitly provided arguments that were evaluated
# where the values are different from the default arguments
explicitly_passed_args <- evaluated_args[names(evaluated_args) %in% names(default_args) &
!purrr::map2_lgl(evaluated_args, default_args[names(evaluated_args)], identical)]
# Return a partially applied version of the function using evaluated arguments
return(purrr::partial(fn, !!!explicitly_passed_args))
}
45 changes: 45 additions & 0 deletions r/man/normalize.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 17 additions & 8 deletions r/pkgdown/_pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ template:
includes:
in_header: |
<script defer data-domain="benparks.net" src="https://plausible.benparks.net/js/visit-counts.js"></script>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.15/dist/katex.min.css" integrity="sha384-Htz9HMhiwV8GuQ28Xr9pEs1B4qJiYu/nYLLwlDklR53QibDfmQzi7rYxXhMH/5/u" crossorigin="anonymous">
<!-- The loading of KaTeX is deferred to speed up page rendering -->
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.15/dist/katex.min.js" integrity="sha384-bxmi2jLGCvnsEqMuYLKE/KsVCxV3PqmKeK6Y6+lmNXBry6+luFkEOsmp5vD9I/7+" crossorigin="anonymous"></script>
<!-- To automatically render math in text elements, include the auto-render extension: -->
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.15/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous" onload="renderMathInElement(document.body);"></script>
after_body: |
<img src="https://plausible.benparks.net/flask-plausible/bpcells-docs.png" style="position:absolute;" />

Expand Down Expand Up @@ -130,15 +135,11 @@ reference:
- IterableMatrix-methods
- pseudobulk_matrix

- title: "Reference Annotations"
- title: "Single-cell analysis helpers"
- subtitle: "Dimensionality reduction"
- contents:
- human_gene_mapping
- match_gene_symbol
- read_gtf
- read_bed
- read_ucsc_chrom_sizes

- title: "Clustering"
- normalize_log
- subtitle: "Clustering"
- contents:
- knn_hnsw
- cluster_graph_leiden
Expand Down Expand Up @@ -176,3 +177,11 @@ reference:
- discrete_palette
- collect_features
- rotate_x_labels

- title: "Reference Annotations"
- contents:
- human_gene_mapping
- match_gene_symbol
- read_gtf
- read_bed
- read_ucsc_chrom_sizes
45 changes: 45 additions & 0 deletions r/tests/testthat/test-matrix_transforms.R
Original file line number Diff line number Diff line change
Expand Up @@ -346,3 +346,48 @@ test_that("linear regression works", {
expect_equal(as(m1, "matrix"), ans)
expect_equal(as(m1t, "matrix"), ans)
})

test_that("tf-idf normalization works", {
m <- generate_sparse_matrix(5, 5)
rownames(m) <- paste0("row", seq_len(nrow(m)))
rev_rownames <- rev(rownames(m))
# Create tf-idf normalization for dgCMatrix
res_dgc <- log1p((diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix")) * 1e4)

rownames(res_dgc) <- rownames(m)
m2 <- as(m, "IterableMatrix")
# Check that we can pass in row means as a (named) vector
row_means <- matrix_stats(m2, row_stats = c("mean"))$row_stats["mean",]
# Test that row means ordering does not matter as long as names exist
row_means_shuffled <- row_means[sample(1:length(row_means))]
# Test that row means can have an extra element as long as all rownames are in the vector
row_means_plus_one <- c(row_means, row6 = 1)


res <- normalize_tfidf(m2)
expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6)
res_with_row_means <- normalize_tfidf(m2, feature_means = row_means)
res_with_row_means_partial <- normalize_tfidf(feature_means = row_means)(m2)
expect_equal(res_with_row_means, res_with_row_means_partial)
expect_identical(res, res_with_row_means)

res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled)
expect_identical(res_with_row_means, res_with_shuffled_row_means, res)

res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one)
expect_identical(res, res_with_row_means_with_extra_element)
})

test_that("normalize_log works", {
m <- generate_sparse_matrix(5, 5)
res_dgc <- m %*% diag(1/colSums(m)) %>% as("dgCMatrix")
m2 <- as(m, "IterableMatrix")
# Test that default params yield the same as log1p on dgCMatrix
res_1 <- as(normalize_log(m2), "dgCMatrix")
expect_equal(res_1, log1p(res_dgc*1e4), tolerance = 1e-6)

# Test that changing scale factor works
res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix")
res_2_partial <- as(normalize_log(scale_factor = 1e5)(m2), "dgCMatrix")
expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6)
})