Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
149 commits
Select commit Hold shift + click to select a range
1ef6091
[r] add tf-idf and log normalization functions
immanuelazn Dec 12, 2024
98675d0
[r] fix normalization tests
immanuelazn Dec 12, 2024
2f83ae6
[r] add in requested changes
immanuelazn Dec 14, 2024
6381f74
[r] removed unused variable
immanuelazn Dec 14, 2024
8e80dc5
[r] add feature selection methods
immanuelazn Dec 15, 2024
c50ead2
[r] update select_features_by_dispersion() to reflect archr defaults
immanuelazn Jan 10, 2025
13c3760
[r] add lsi, var feature selection
immanuelazn Oct 31, 2024
36a8983
[r] add lsi, variable feature selection
immanuelazn Nov 4, 2024
2be2efe
[r] parametrize z_score_norm, create temp option to return more info …
immanuelazn Nov 7, 2024
dccc3a5
[r] add test case for LSI comparing to archr
immanuelazn Nov 7, 2024
183dd40
[r] clean up var gene selection, lsi docstring
immanuelazn Nov 7, 2024
4972f34
[r] add variable gene selection test
immanuelazn Nov 7, 2024
e4d5cb0
[r] provide more colour to scanpy feat selection test
immanuelazn Nov 7, 2024
99470e0
[r] cleanup real data tests
immanuelazn Nov 7, 2024
3bf8914
[r] clean up lsi, var features docstrings
immanuelazn Nov 8, 2024
a7c6179
[r] add in more lsi real data tests
immanuelazn Nov 8, 2024
acf35b2
[r] remove unused variable from `lsi()`
immanuelazn Nov 18, 2024
47256db
[r] add requested changes
immanuelazn Dec 2, 2024
004499a
[r] fix requested changes
immanuelazn Dec 2, 2024
dd80165
[r] fix lsi docstring, idf_ logic
immanuelazn Dec 3, 2024
8891981
[r] replace z-score norm with corr cutoffs
immanuelazn Dec 7, 2024
1e7c6d0
[r] update LSI to use norm, feature selection helpers
immanuelazn Jan 9, 2025
e9c302e
[r] update `NEWS.md`
immanuelazn Jan 10, 2025
7ed6bd7
[r] remove test artifacts
immanuelazn Jan 10, 2025
199ae82
Update docs
bnprks Jan 11, 2025
2911cf1
Merge branch 'main' into ia/normalizations
bnprks Jan 11, 2025
553f262
Update NEWS
bnprks Jan 11, 2025
7511f0b
Update docs
bnprks Jan 11, 2025
d67b7db
[r] add logging, partial args
immanuelazn Jan 13, 2025
435724b
[r] add partial args to normalizations
immanuelazn Jan 14, 2025
8dbe8e5
[r] create mechanism for partial calls on explicit args
immanuelazn Jan 14, 2025
21af3f9
Merge branch 'ia/normalizations' into ia/feature-selection
immanuelazn Jan 14, 2025
067b540
[r] add partial calls, update feature selection docs
immanuelazn Jan 14, 2025
5e49504
Update docs
bnprks Jan 11, 2025
453215f
[ci] add update to apt-get (#164)
immanuelazn Dec 17, 2024
3c99a01
[r] Fix articles index to include manuscript draft (#170)
bnprks Dec 17, 2024
5a8a335
[r] Fix type confusion in `pseudobulk_matrix()` and clean up `paralle…
bnprks Dec 21, 2024
8e0603b
[r] 0.3.0 release announcement (#177)
bnprks Dec 22, 2024
4391202
[r] Improve error printing in `call_peaks_macs` (#175)
bnprks Jan 9, 2025
f1232b0
[r][cpp] Support writing AnnData dense matrices (#166)
ycli1995 Jan 9, 2025
bb7f5e2
Update NEWS
bnprks Jan 11, 2025
765a0cb
Update docs
bnprks Jan 11, 2025
3a33209
[r] add partial args to normalizations
immanuelazn Jan 14, 2025
891868c
[r] create mechanism for partial calls on explicit args
immanuelazn Jan 14, 2025
00922d7
[r] add partial calls, update feature selection docs
immanuelazn Jan 14, 2025
4e27f5d
[r] add lsi, variable feature selection
immanuelazn Nov 4, 2024
ebebad4
[r] update LSI to use norm, feature selection helpers
immanuelazn Jan 9, 2025
b1ab04c
[r] add iterative LSI implementation
immanuelazn Jan 10, 2025
1ae19b2
Merge branch 'ia/feature-selection' into ia/lsi
immanuelazn Jan 18, 2025
76f4c7d
[r] change check for `pseudobulk_matrix()` to use whole number instea…
immanuelazn Jan 10, 2025
5e3a7fe
[r] update feature selection documentation
immanuelazn Jan 18, 2025
2e11ccd
Merge branch 'ia/lsi' into ia/lsi-iterative
immanuelazn Jan 18, 2025
16d5344
[r] reorder assertions, add new partial func system
immanuelazn Jan 24, 2025
87eb430
[r] change behaviour of num_feats default args, write docs
immanuelazn Jan 24, 2025
613b0df
Merge branch 'ia/feature-selection' into ia/lsi
immanuelazn Jan 24, 2025
6c4285b
[r] fix binned dispersion naming
immanuelazn Jan 25, 2025
69eea78
Merge branch 'ia/lsi' into ia/lsi-iterative
immanuelazn Jan 25, 2025
04f67f2
[r] change normalize text for feature selection
immanuelazn Jan 25, 2025
4d17557
[r] add iterative LSI
immanuelazn Jan 27, 2025
5289f39
[r] add rcpphnsw to imports
immanuelazn Jan 27, 2025
19e96d3
[r] fix num_feats logic in feature selection
immanuelazn Jan 27, 2025
eefb33d
[r] add blurb about partials in normalize
immanuelazn Jan 27, 2025
e010069
[r] update NEWS
immanuelazn Jan 27, 2025
1265088
[r] update feature selection docs
immanuelazn Jan 27, 2025
f0d9563
Merge remote-tracking branch 'origin/main' into ia/lsi-iterative
immanuelazn Feb 3, 2025
dd03ecc
[r] update partial code styling in feature selection, normalization
immanuelazn Feb 3, 2025
83d8877
[r] fix matrix flexibility in normalization, feature selection, lsi
immanuelazn Feb 3, 2025
1093cf5
docs updates
bnprks Feb 6, 2025
82c2ec7
[r] add in timestamp logging, fix partial creation
immanuelazn Feb 7, 2025
d9328c0
[r] update docs
immanuelazn Feb 7, 2025
c926219
[r] add in additional function for matrix coercibility
immanuelazn Feb 7, 2025
eaeb56f
[r] update NEWS
immanuelazn Feb 7, 2025
b6a13dd
[r] update feature selcection/normalization with PR suggestions
immanuelazn Feb 7, 2025
74e12b7
[r] change matrix checks in feat selection, normalization, update nor…
immanuelazn Feb 7, 2025
c8b73a2
[r] add docs changes to DimReduction, allow for knn parameterization …
immanuelazn Feb 7, 2025
a3dd8a3
[r] change feature selection column name from names to feature
immanuelazn Feb 8, 2025
edd4dfe
[r] change iterative LSI defaults
immanuelazn Feb 8, 2025
ff8fb58
[r] update `select_features_mean()` to be caleld `select_features_acc…
immanuelazn Feb 10, 2025
9c71065
[r] update mat assertion, LSI docs
immanuelazn Feb 10, 2025
b6e5a88
[r] remove verbose flag from feature selection
immanuelazn Feb 10, 2025
157e26e
[r] change rcpphnsw pkg for imports to suggests
immanuelazn Feb 10, 2025
0917f26
[r] fix assert_is_mat, add tests
immanuelazn Feb 11, 2025
82752eb
[r] fix styling on `assert_is_mat()`
immanuelazn Feb 12, 2025
3a5a9d4
[r] add docs changes to normalize/binarize, give binarize partials
immanuelazn Feb 13, 2025
f97cb51
[r] update clustering to allow for single step processing
immanuelazn Feb 13, 2025
225ef00
[r] revert select_features_accessibility to be mean, change LSI to wo…
immanuelazn Feb 13, 2025
293f8c4
[r] change lsi clustering to work with github actions
immanuelazn Feb 13, 2025
d213e39
[r] fix clustering partials for knn_annoy
immanuelazn Feb 13, 2025
68234f0
[r] change clustering partials to use matrix/list type
immanuelazn Feb 14, 2025
5aa8fc2
[r] update feature selection example
immanuelazn Feb 18, 2025
84fef23
[r] rewrite `partial_apply()` defaults, fix up docs
immanuelazn Feb 18, 2025
7351a20
[r] clean up dimreductiondocs
immanuelazn Feb 18, 2025
8ddf94a
[r] change wrong `missing_args_error` flag on LSI
immanuelazn Feb 18, 2025
b530d13
[r] remove missing_args
immanuelazn Feb 18, 2025
e0952de
[r] remove first_feature_selection argument in `IterativeLSI`
immanuelazn Feb 18, 2025
c57945e
[r] revert removal of `.overwrite` in `partial_apply()`
immanuelazn Feb 18, 2025
11fec1a
[r] cleanup lsi, iterative lsi docs
immanuelazn Feb 18, 2025
882d74d
[r] change default feature selection for `IterativeLSI()` to var
immanuelazn Feb 18, 2025
732c800
[r] fill out `is_knn_matrix()` docs
immanuelazn Feb 19, 2025
a5b200b
[r] make feature selection use `normalize_method` argument
immanuelazn Feb 20, 2025
f1104bf
[r] fix removal of pcs in `IterativeLSI()`, documentation
immanuelazn Feb 20, 2025
b98518a
[r] clean up `IterativeLSI()`
immanuelazn Feb 20, 2025
acc0964
[r] globally rename knn matrix to knn object
immanuelazn Feb 20, 2025
24d7e69
[r] update clustering functions to use `mat` instead of `snn`
immanuelazn Feb 20, 2025
b3ba6a3
update NEWS.md
immanuelazn Feb 20, 2025
0e6b176
[r] add threads and verbose arguments to cluster functions
immanuelazn Feb 20, 2025
7ce2ba2
[r] add ability to project into different iterations of `IterativeLSI()`
immanuelazn Feb 20, 2025
280e769
[r] re-change knn matrix to knn object
immanuelazn Feb 20, 2025
3fafebb
[r] tidy up `DimReduction` docs styling
immanuelazn Feb 20, 2025
ebe840b
[r] change `is_knn_matrix()` to `is_knn_object()`
immanuelazn Feb 20, 2025
1dfeb15
[r] change docs styling for `IterativeLSI()`
immanuelazn Feb 20, 2025
d15a0e5
[r] transpose cell emebeddings in `DimReduction`
immanuelazn Feb 21, 2025
df64069
[r] add pc removal tests for `LSI()`, make `LSI()` work when only one…
immanuelazn Feb 21, 2025
6c248e0
[r] expand `IterativeLSI()` docs
immanuelazn Feb 21, 2025
4ce1c61
[r] general docs and code cleanup
immanuelazn Feb 24, 2025
17d9d96
[r] add `RcppAnnoy` dependency check and requirement
immanuelazn Feb 24, 2025
f8f4c04
[r] clean up feature selection docs
immanuelazn Feb 24, 2025
810adac
[r] more docs cleanups
immanuelazn Feb 24, 2025
4af4e89
[r] add requested pr changes
immanuelazn Feb 25, 2025
26597c8
[r] redo normalize examples for clarity
immanuelazn Feb 25, 2025
6437793
[r] add inn missing rd for `convert_mat_to_cluster_matrix()`
immanuelazn Feb 27, 2025
1030f77
[r] add more comparisons between `IterativeLSI()` and ArchR's impleme…
immanuelazn Feb 27, 2025
019de7f
[r] fix miscellaneous syntax/reference inconsistencies for clustering
immanuelazn Feb 28, 2025
4cbf147
[r] fix `knn_obj_method` docs inconsistency, fix typo for `ef` param …
immanuelazn Feb 28, 2025
a2ee2c6
[r] update `knn_obj_method` documentation
immanuelazn Feb 28, 2025
49c497d
[r] fix math wording in `feature_selection`
immanuelazn Feb 28, 2025
cf108a8
[r] clean up normalizations code, docs
immanuelazn Feb 28, 2025
beaef0a
[r] remove knn method from graph adjacency functions.
immanuelazn Mar 1, 2025
307b2c5
[r] clean up `binarize()` examples
immanuelazn Mar 1, 2025
28bb79e
[r] rename `cluster` Rd to `cluster_graph`
immanuelazn Mar 1, 2025
e89b467
[r] clean up lsi code clarity, documentation
immanuelazn Mar 3, 2025
8de044e
[r] update LSI documentation
immanuelazn Mar 3, 2025
adb1749
[r] expand docstrings for `IterativeLSI()`
immanuelazn Mar 3, 2025
80e2ad2
[r] clean up `knn_to_graph_method` param in `cluster_graph*()` functions
immanuelazn Mar 3, 2025
a737385
[r] clean up `select_features*()` documentation
immanuelazn Mar 3, 2025
6de7df5
[r] revert broken case in `normalize_tfidf()` with non-null row means
immanuelazn Mar 4, 2025
aa5d9a2
[r] remove functionality of `assert_is_mat()` when passed multiple ma…
immanuelazn Mar 4, 2025
1b588dd
[r] clean up svd call in `LSI()`
immanuelazn Mar 4, 2025
4016ddf
[r] add in POC clustering wrapper
immanuelazn Mar 5, 2025
64b6df4
[r] change clustering functions to only accept graph adjacency matrix
immanuelazn Mar 5, 2025
701f7b4
[r] cleanup mat handling during `LSI()`
immanuelazn Mar 5, 2025
89ce4c0
[r] cleanup `cluster_cells_graph()` title
immanuelazn Mar 5, 2025
e16890f
[r] remove partialized binarization, clean normalization documentation
immanuelazn Mar 19, 2025
f5d0129
[r] clean up feature selection documentation, add additional dim chec…
immanuelazn Mar 19, 2025
75017f2
[r] add initial examples for `DimReduction` subclasses
immanuelazn Apr 7, 2025
03ed4f9
[r] add example updates to `LSI()` `IterativeLSI()` `feature_selectio…
immanuelazn Apr 10, 2025
88f9c20
[r] add `print` method for dim reductions (#115)
immanuelazn Apr 15, 2025
0dd291b
[r] add missing `feature_names` attr to `IterativeLSI()` documentation
immanuelazn Apr 17, 2025
1851026
[r] remove commented out code in `print.Dimreduction`
immanuelazn Apr 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion r/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ Suggests:
IRanges,
GenomicRanges,
matrixStats,
igraph
igraph,
RcppHNSW,
RcppAnnoy
Depends:
R (>= 3.5.0)
Config/Needs/website: pkgdown, devtools, uwot, irlba, RcppHNSW, igraph, BiocManager, bioc::BSgenome.Hsapiens.UCSC.hg38, github::GreenleafLab/motifmatchr, github::GreenleafLab/chromVARmotifs
15 changes: 15 additions & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,18 @@

S3method(base::as.data.frame,IterableFragments)
S3method(base::as.matrix,IterableMatrix)
S3method(print,DimReduction)
S3method(project,IterativeLSI)
S3method(project,LSI)
S3method(project,default)
S3method(svds,IterableMatrix)
S3method(svds,default)
export("all_matrix_inputs<-")
export("cellNames<-")
export("chrNames<-")
export(DimReduction)
export(IterativeLSI)
export(LSI)
export(add_cols)
export(add_rows)
export(all_matrix_inputs)
Expand All @@ -20,6 +27,7 @@ export(canonical_gene_symbol)
export(cellNames)
export(checksum)
export(chrNames)
export(cluster_cells_graph)
export(cluster_graph_leiden)
export(cluster_graph_louvain)
export(cluster_graph_seurat)
Expand Down Expand Up @@ -65,6 +73,8 @@ export(min_by_row)
export(min_scalar)
export(multiply_cols)
export(multiply_rows)
export(normalize_log)
export(normalize_tfidf)
export(nucleosome_counts)
export(open_fragments_10x)
export(open_fragments_dir)
Expand All @@ -83,6 +93,7 @@ export(plot_tf_footprint)
export(plot_tss_profile)
export(plot_tss_scatter)
export(prefix_cell_names)
export(project)
export(pseudobulk_matrix)
export(qc_scATAC)
export(range_distance_to_nearest)
Expand All @@ -108,6 +119,10 @@ export(rowVars.default)
export(sctransform_pearson)
export(select_cells)
export(select_chromosomes)
export(select_features_binned_dispersion)
export(select_features_dispersion)
export(select_features_mean)
export(select_features_variance)
export(select_regions)
export(set_trackplot_height)
export(set_trackplot_label)
Expand Down
9 changes: 9 additions & 0 deletions r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,17 @@ Contributions welcome :)

# BPCells 0.3.1 (in-progress main branch)

## Breaking changes
- Change first parameter name of `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()` from `snn` to `mat` to more accurately reflect the input type. (pull request #189)

## Features
- Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166)
- Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168)
- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p (pull request #189)
- Add feature selection functions `select_features_variance()`, and `select_features_{dispersion,mean,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189)
- Add `LSI()` and `IterativeLSI()` dimensionality functions to perform latent semantic indexing on a matrix (pull request #189).
- Add capability to create partial function objects by calling a function without its first argument. This is implemented in normalizations, feature selections, dimensionality reductions, and clustering functions. See `select_features_variance()` for usage. (pull request #189)
- Create a wrapper function `cluster_cells_graph()` that wraps the steps of knn object creation, graph adjacency creation, and clustering all within a single function (pull request #189)

## Improvements
- Speed up taking large subsets of large concatenated matrices, e.g. selecting 9M cells from a 10M cell matrix composed of ~100 concatenated pieces. (pull request #179)
Expand Down
151 changes: 129 additions & 22 deletions r/R/clustering.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,84 @@
# option. This file may not be copied, modified, or distributed
# except according to those terms.


#' Check if an input is a kNN object
#'
#' The kNN functions `knn_hnsw()` and `knn_annoy()` return a list of two matrices, `idx` and `dist`.
#' These are used as inputs to create graph adjacency matrices for clustering.
#' Any list that contains at least both an `idx` and a `dist` item is treated as a kNN object.
#' @param mat Object to check.
#' @return TRUE if `mat` is a kNN object, FALSE otherwise
#' @keywords internal
is_knn_object <- function(mat) {
  # Anything that is not a list can never be a kNN object.
  if (!is(mat, "list")) {
    return(FALSE)
  }
  # Only the presence of both names is checked; extra items are allowed.
  required_items <- c("idx", "dist")
  all(required_items %in% names(mat))
}

#' Check if an input is a graph adjacency matrix.
#'
#' Clustering functions like `cluster_graph_leiden()` and `cluster_graph_louvain()` require a graph adjacency matrix as input.
#' Any square `dgCMatrix` is assumed to be a graph adjacency matrix.
#' @param mat Object to check.
#' @return TRUE if `mat` is a graph adjacency matrix, FALSE otherwise
#' @keywords internal
is_adjacency_matrix <- function(mat) {
  # Only sparse `dgCMatrix` inputs qualify; reject everything else up front.
  if (!is(mat, "dgCMatrix")) {
    return(FALSE)
  }
  # An adjacency matrix must have as many rows as columns.
  dims <- dim(mat)
  dims[1L] == dims[2L]
}

#' Cluster embeddings using a KNN-Graph based algorithm
#'
#' Take in a cell embedding matrix and sequentially convert it into a kNN object, then into
#' a graph adjacency matrix. Finally, assign a cluster label to every cell using a clustering algorithm.
#'
#' @param mat (matrix) Cell embeddings matrix of shape `(cells x n_embeddings)`
#' @param knn_method (function) Function to convert cell embeddings into a knn object.
#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`.
#' @param knn_to_graph_method (function) Function to convert the knn object returned from `knn_method` to a graph adjacency matrix.
#' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`.
#' @param graph_to_cluster_method (function) Clustering algorithm that converts a graph adjacency matrix
#' returned from `knn_to_graph_method` into cluster labels for each cell.
#' Must be a (optionally partialized) version of `cluster_graph_leiden()`, `cluster_graph_louvain()` or `cluster_graph_seurat()`.
#' @param threads (integer) Number of threads to use in `knn_method`, `knn_to_graph_method` and `graph_to_cluster_method`. If these functions do not utilize
#' a `threads` argument, this is silently ignored.
#' @param verbose (logical) Whether to print progress information in `knn_method`, `knn_to_graph_method` and `graph_to_cluster_method`. If these functions do not utilize
#' a `verbose` argument, this is silently ignored.
#' @returns (factor) Factor vector containing the cluster assignment for each cell.
#' @details
#' `cluster_cells_graph()` acts as a helper function to wrap input creation and `kNN` graph adjacency-based clustering to be done together. The user
#' can also manually pass cell embeddings to their preferred knn/clustering functions of choice.
#'
#' **Clustering customization through partialized parameters**
#'
#' Customization of clustering is possible through partialization of each parameter in `cluster_cells_graph()` that is a function.
#' In detail, each parameter that requests a function
#' may take in one with only some of the arguments provided. If the first argument is not provided, a copy of a function is utilized that has its parameters
#' changed with the arguments provided.
#'
#' For instance, if the user desires for `cluster_cells_graph()` to instead use `cluster_graph_louvain()` with resolution different than the default,
#' they can instead call `cluster_cells_graph()` like so:
#' `cluster_cells_graph(mat, graph_to_cluster_method = cluster_graph_louvain(resolution = 0.5))`
#' @seealso `knn_hnsw()` `knn_annoy()` `knn_to_graph()` `knn_to_snn_graph()` `knn_to_geodesic_graph()` `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()`
#' @export
cluster_cells_graph <- function(
  mat, knn_method = knn_hnsw,
  knn_to_graph_method = knn_to_geodesic_graph,
  graph_to_cluster_method = cluster_graph_leiden,
  threads = 0L, verbose = FALSE
) {
  assert_is_wholenumber(threads)
  assert_is(verbose, "logical")
  # When called without `mat`, return whatever `create_partial()` builds for this call.
  if (rlang::is_missing(mat)) {
    return(create_partial())
  }
  assert_is(mat, "matrix")

  # Step 1: cell embeddings -> knn object.
  # `.missing_args_error = FALSE` is set on every step so that `threads`/`verbose`
  # can be offered to functions that may not accept them.
  embeddings_to_knn <- partial_apply(
    knn_method,
    threads = threads, verbose = verbose, .missing_args_error = FALSE
  )
  mat <- embeddings_to_knn(mat)
  if (!is_knn_object(mat)) {
    pretty_error(mat, "`knn_method` was unable to convert `mat` into a knn object", 1)
  }

  # Step 2: knn object -> graph adjacency matrix.
  # No current `knn_to_graph` function uses `verbose`; it is still forwarded in
  # case future ones do. The return type is pinned to "matrix" on the user's behalf.
  knn_to_adjacency <- partial_apply(
    knn_to_graph_method,
    threads = threads, verbose = verbose, return_type = "matrix", .missing_args_error = FALSE
  )
  mat <- knn_to_adjacency(mat)
  if (!is_adjacency_matrix(mat)) {
    pretty_error(mat, "`knn_to_graph_method` was unable to convert `mat` from a knn object to a graph adjacency matrix", 1)
  }

  # Step 3: graph adjacency matrix -> cluster labels.
  # `threads` and `verbose` are forwarded here too, in case clustering functions
  # gain these parameters later.
  adjacency_to_clusters <- partial_apply(
    graph_to_cluster_method,
    threads = threads, verbose = verbose, .missing_args_error = FALSE
  )
  mat <- adjacency_to_clusters(mat)
  mat
}

#' K Nearest Neighbor (KNN) Graph
#'
#' Convert a KNN object (e.g. returned by `knn_hnsw()` or `knn_annoy()`) into
Expand All @@ -22,6 +100,11 @@
#' Sparse matrix (dgCMatrix) where `mat[i,j]` = distance from cell `i` to
#' cell `j`, or 0 if cell `j` is not in the K nearest neighbors of `i`
knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) {
assert_is(use_weights, "logical")
assert_is(self_loops, "logical")

if (rlang::is_missing(knn)) return(create_partial())
assert_true(is_knn_object(knn))
if (use_weights) {
weights <- knn$dist
} else {
Expand All @@ -41,7 +124,6 @@ knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) {
mat
}


#' @rdname knn_graph
#' @details **knn_to_snn_graph**
#' Convert a knn object into a shared nearest neighbors adjacency matrix.
Expand All @@ -63,6 +145,9 @@ knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) {
#' @export
knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_type=c("matrix", "list")) {
return_type <- match.arg(return_type)
assert_is(self_loops, "logical")
if (rlang::is_missing(knn)) return(create_partial())
assert_true(is_knn_object(knn))
# Solve x / (2*K - x) >= min_val --> x >= 2*K*min_val / (1 + min_val)
min_int <- ceiling(2*min_val*ncol(knn$idx) / (1 + min_val))
snn <- build_snn_graph_cpp(knn$idx, min_neighbors = min_int)
Expand All @@ -83,13 +168,15 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t
}

# Return as a sparse matrix
Matrix::sparseMatrix(
res <- Matrix::sparseMatrix(
i = snn$i + 1L, j = snn$j + 1L, x = snn$weight,
dims = c(snn$dim, snn$dim)
)
return(res)
}

#' @rdname knn_graph
#' @param threads Number of threads to use during calculations
#' @details **knn_to_geodesic_graph**
#' Convert a knn object into an undirected weighted graph, using the same
#' geodesic distance estimation method as the UMAP package.
Expand All @@ -101,7 +188,6 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t
#' neighbor, results may differ slightly from `umap._umap.fuzzy_simplicial_set`, which
#' assumes self is always successfully found in the approximate nearest neighbor search.
#'
#' @param threads Number of threads to use during calculations
#' @return **knn_to_geodesic_graph**
#' - `return_type == "matrix"`:
#' Sparse matrix (dgCMatrix) where `mat[i,j]` = normalized similarity between cell `i` and cell `j`.
Expand All @@ -111,74 +197,87 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t
#' These correspond to the rows, cols, and values of non-zero entries in the lower triangle
#' adjacency matrix. `dim` is the total number of vertices (cells) in the graph
#' @export
knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads=0L) {
knn_to_geodesic_graph <- function(knn, return_type = c("matrix", "list"), threads = 0L) {
return_type <- match.arg(return_type)
assert_is_wholenumber(threads)
if (rlang::is_missing(knn)) return(create_partial())
assert_true(is_knn_object(knn))
graph <- build_umap_graph_cpp(knn$dist, knn$idx, threads=threads)

graph$dim <- nrow(knn$idx)
if (return_type == "list") {
return(graph)
}
if (return_type == "list") return(graph)

# Return as a sparse matrix
Matrix::sparseMatrix(
res <- Matrix::sparseMatrix(
i = graph$i + 1L, j = graph$j + 1L, x = graph$weight,
dims = c(graph$dim, graph$dim)
)
return(res)
}

#' Cluster an adjacency matrix
#' @rdname cluster
#' @rdname cluster_graph
#' @details **cluster_graph_leiden**: Leiden clustering algorithm `igraph::cluster_leiden()`.
#' Note that when using `objective_function = "CPM"` the number of clusters empirically scales with `cells * resolution`,
#' so 1e-3 is a good resolution for 10k cells, but 1M cells is better with a 1e-5 resolution. A resolution of 1 is a
#' good default when `objective_function = "modularity"` per the default.
#' @param snn Symmetric adjacency matrix (dgCMatrix) output from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Only the lower triangle is used
#' @param mat Symmetric adjacency matrix (dgCMatrix) output from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Only the lower triangle is used.
#' @param resolution Resolution parameter. Higher values result in more clusters
#' @param objective_function Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below).
#' For the meaning of each option, see `igraph::cluster_leiden()`.
#' @param seed Random seed for clustering initialization
#' @param ... Additional arguments to underlying clustering function
#' @return Factor vector containing the cluster assignment for each cell.
#' @export
cluster_graph_leiden <- function(snn, resolution = 1, objective_function = c("modularity", "CPM"), seed = 12531, ...) {
cluster_graph_leiden <- function(
mat, resolution = 1, objective_function = c("modularity", "CPM"),
seed = 12531, ...
) {
assert_has_package("igraph")
# Set seed without permanently changing seed state
if (rlang::is_missing(mat)) return(create_partial())
prev_seed <- get_seed()
on.exit(restore_seed(prev_seed), add = TRUE)
set.seed(seed)

objective_function <- match.arg(objective_function)

igraph::graph_from_adjacency_matrix(snn, weighted = TRUE, diag = FALSE, mode = "lower") %>%
igraph::graph_from_adjacency_matrix(mat, weighted = TRUE, diag = FALSE, mode = "lower") %>%
igraph::cluster_leiden(resolution_parameter = resolution, objective_function=objective_function, ...) %>%
igraph::membership() %>%
as.factor()
}


#' @rdname cluster
#' @rdname cluster_graph
#' @details **cluster_graph_louvain**: Louvain graph clustering algorithm `igraph::cluster_louvain()`
#' @export
cluster_graph_louvain <- function(snn, resolution = 1, seed = 12531) {
cluster_graph_louvain <- function(
mat, resolution = 1, seed = 12531
) {
assert_has_package("igraph")
# Set seed without permanently changing seed state
if (rlang::is_missing(mat)) return(create_partial())

prev_seed <- get_seed()
on.exit(restore_seed(prev_seed), add = TRUE)
set.seed(seed)

igraph::graph_from_adjacency_matrix(snn, weighted = TRUE, diag = FALSE, mode = "lower") %>%
igraph::graph_from_adjacency_matrix(mat, weighted = TRUE, diag = FALSE, mode = "lower") %>%
igraph::cluster_louvain(resolution = resolution) %>%
igraph::membership() %>%
as.factor()
}

#' @rdname cluster
#' @rdname cluster_graph
#' @details **cluster_graph_seurat**: Seurat's clustering algorithm `Seurat::FindClusters()`
#' @export
cluster_graph_seurat <- function(snn, resolution = 0.8, ...) {
cluster_graph_seurat <- function(
mat, resolution = 0.8, ...
) {
assert_has_package("Seurat")
Seurat::as.Graph(snn) %>%
if (rlang::is_missing(mat)) return(create_partial())
Seurat::as.Graph(mat) %>%
Seurat::FindClusters(resolution = resolution, ...) %>%
.[[1]]
}
Expand Down Expand Up @@ -213,7 +312,7 @@ cluster_membership_matrix <- function(groups, group_order = NULL) {
}


#' Get a knn matrix from reduced dimensions
#' Get a knn object from reduced dimensions
#'
#' Search for approximate nearest neighbors between cells in the reduced
#' dimensions (e.g. PCA), and return the k nearest neighbors (knn) for each
Expand All @@ -228,7 +327,7 @@ cluster_membership_matrix <- function(groups, group_order = NULL) {
#' @param metric distance metric to use
#' @param threads Number of threads to use. Note that result is non-deterministic
#' if threads > 1
#' @param ef ef parameter for RccppHNSW::hnsw_search. Increase for slower search but
#' @param ef ef parameter for `RcppHNSW::hnsw_search()`. Increase for slower search but
#' improved accuracy
#' @param verbose whether to print progress information during search
#' @return List of 2 matrices -- idx for cell x K neighbor indices,
Expand All @@ -238,6 +337,10 @@ cluster_membership_matrix <- function(groups, group_order = NULL) {
#' @export
knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine"), verbose = TRUE, threads = 1, ef = 100) {
metric <- match.arg(metric)
assert_is(verbose, "logical")
assert_is_wholenumber(threads)
assert_has_package("RcppHNSW")
if (rlang::is_missing(data)) return(create_partial())
index <- RcppHNSW::hnsw_build(
data,
distance = metric,
Expand Down Expand Up @@ -271,8 +374,11 @@ knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine
#' @param n_trees Number of trees during index build time. More trees gives higher accuracy
#' @param search_k Number of nodes to inspect during the query, or -1 for default value. Higher number gives higher accuracy
#' @export
knn_annoy <- function(data, query = data, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, search_k = -1) {
knn_annoy <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, search_k = -1) {
metric <- match.arg(metric)
assert_has_package("RcppAnnoy")
if (rlang::is_missing(data)) return(create_partial())
if (is.null(query)) query <- data
annoy <- switch(metric,
"euclidean" = new(RcppAnnoy::AnnoyEuclidean, ncol(data)),
"cosine" = new(RcppAnnoy::AnnoyAngular, ncol(data)),
Expand All @@ -294,5 +400,6 @@ knn_annoy <- function(data, query = data, k = 10, metric = c("euclidean", "cosin
dist[i, ] <- res$dist
}
if (metric == "cosine") dist <- 0.5 * (dist * dist)
list(idx = idx, dist = dist)
res <- list(idx = idx, dist = dist)
return(res)
}
7 changes: 6 additions & 1 deletion r/R/errorChecking.R
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,12 @@ assert_is <- function(object, class, n = 1) {
if (!match) pretty_error(object, sprintf("must have class %s", paste0(class, collapse = ", or ")), n)
}
}

assert_is_mat <- function(object, n = 1) {
  # Accept any object that `canCoerce()` reports as convertible to an
  # IterableMatrix; everything else is reported through `pretty_error()`.
  coercible <- canCoerce(object, "IterableMatrix")
  if (!coercible) {
    pretty_error(object, "must either be an IterableMatrix or coercible to an IterableMatrix", n)
  }
}
assert_true <- function(expr, n = 1) {
  # Report through `pretty_error()` whenever the supplied condition does not hold.
  failed <- !expr
  if (failed) {
    pretty_error(expr, "is not true", n)
  }
}
Expand Down
Loading