diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index 69712b2..ecd6a9f 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -14,12 +14,10 @@ jobs: config: - { os: ubuntu-latest, r: '4.5' } - { os: ubuntu-latest, r: '4.4' } - - { os: ubuntu-latest, r: '4.3' } - - { os: ubuntu-latest, r: '4.2' } - - { os: windows-latest, r: '4.3' } - - { os: windows-latest, r: '4.2' } - - { os: macos-latest, r: '4.3' } - - { os: macos-latest, r: '4.2' } + - { os: windows-latest, r: '4.5' } + - { os: windows-latest, r: '4.4' } + - { os: macos-latest, r: '4.5' } + - { os: macos-latest, r: '4.4' } name: R ${{ matrix.config.r }} - ${{ matrix.config.os }} steps: diff --git a/DESCRIPTION b/DESCRIPTION index 76929b5..97e5171 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: splikit Title: A toolkit for analysing RNA splicing in scRNA-seq data -Version: 1.0.2 +Version: 1.0.3 Authors@R: person("Arsham", "Mikaeili Namini", , "arsham.mikaeilinamini@mail.mcgill.ca", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-9453-6951")) @@ -20,7 +20,8 @@ Imports: methods, stats, Rcpp, - RcppArmadillo + RcppArmadillo, + RcppEigen LinkingTo: Rcpp, RcppArmadillo, diff --git a/NAMESPACE b/NAMESPACE index 28cdb29..0ae24c3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ export(find_variable_genes) export(get_pseudo_correlation) export(get_rowVar) export(get_silhouette_mean) +export(load_toy_M1_M2_object) export(load_toy_SJ_object) export(make_eventdata_plus) export(make_gene_count) @@ -13,7 +14,12 @@ export(make_m1) export(make_m2) export(make_velo_count) import(Matrix) +import(Rcpp) import(data.table) +importClassesFrom(Matrix,dgCMatrix) +importClassesFrom(Matrix,dgTMatrix) +importClassesFrom(Matrix,dsCMatrix) +importClassesFrom(Matrix,dsTMatrix) importFrom(Matrix,Matrix) importFrom(Matrix,readMM) importFrom(Matrix,sparseMatrix) diff --git a/R/feature_selection.R b/R/feature_selection.R index 365dc0a..805a7e6 100644 --- a/R/feature_selection.R +++ b/R/feature_selection.R @@ -8,6 +8,16 @@ #' @param ... Additional arguments to be passed. #' #' @return A \code{data.table} containing the events and their corresponding sum deviance values. +#' +#' @examples +#' # loading the toy dataset +#' toy_obj <- load_toy_M1_M2_object() +#' +#' # getting HVE (high variable events) +#' HVE <- find_variable_events(toy_obj$m1, toy_obj$m2) +#' +#' # printing the results +#' print(HVE[order(-sum_deviance)]) #' @export find_variable_events <- function(m1_matrix, m2_matrix, min_row_sum = 50, verbose=TRUE, n_threads=1, ...) { @@ -95,19 +105,27 @@ find_variable_events <- function(m1_matrix, m2_matrix, min_row_sum = 50, verbose #' @return A \code{data.table} containing gene names (column \code{events}) and computed metrics. #' For the deviance method, this includes \code{sum_deviance} and \code{variance} columns. #' +#' @examples +#' library(Matrix) +#' # loading the toy dataset +#' toy_obj <- load_toy_M1_M2_object() +#' +#' # getting high variable genes +#' HVG_VST <- find_variable_genes(toy_obj$gene_expression, method = "vst") # vst method +#' HVG_DEV <- find_variable_genes(toy_obj$gene_expression, method = "sum_deviance") # sum_deviance method +#' +#' # printing the results +#' print(HVG_VST[order(-standardize_variance)]) +#' print(HVG_DEV[order(-sum_deviance)]) +#' +#' @useDynLib splikit, .registration = TRUE +#' @import Rcpp +#' @import Matrix +#' @importClassesFrom Matrix dgCMatrix dsCMatrix dgTMatrix dsTMatrix #' @export -find_variable_genes <- function(gene_expression_matrix, method = c("vst", "sum_deviance"), ...) { - - # Check required libraries - if (!requireNamespace("data.table", quietly = TRUE)) { - stop("The 'data.table' package is required but not installed.") - } - if (!requireNamespace("Rcpp", quietly = TRUE)) { - stop("The 'Rcpp' package is required but not installed.") - } - if (!requireNamespace("Matrix", quietly = TRUE)) { - stop("The 'Matrix' package is required but not installed.") - } +find_variable_genes <- function(gene_expression_matrix, method = "vst", ...) { + # addinng the vst method as the deafult + method <- match.arg(method, choices = c("vst", "sum_deviance")) # Verify that gene_expression_matrix is a sparse Matrix if (!inherits(gene_expression_matrix, "Matrix")) { @@ -155,7 +173,8 @@ find_variable_genes <- function(gene_expression_matrix, method = c("vst", "sum_d # Calculate deviances using the C++ function deviance_values <- tryCatch({ - calcNBDeviancesWithThetaEstimation(gene_expression_matrix_sub) + calcNBDeviancesWithThetaEstimation(as(gene_expression_matrix_sub, "dgCMatrix")) + }, error = function(e) { stop("Error in calcNBDeviancesWithThetaEstimation function: ", e$message) }) @@ -168,9 +187,9 @@ find_variable_genes <- function(gene_expression_matrix, method = c("vst", "sum_d # Compute row variance using the previously defined function row_var <- tryCatch({ - multigedi_get_row_variance(sparse_matrix = gene_expression_matrix) + splikit::get_rowVar(M = gene_expression_matrix) }, error = function(e) { - stop("Error in multigedi_get_row_variance: ", e$message) + stop("Error in splikit::get_rowVar: ", e$message) }) row_var_cpp_dt <- data.table::data.table(events = rownames(gene_expression_matrix), diff --git a/R/star_solo_processing.R b/R/star_solo_processing.R index c230d07..2aef72b 100644 --- a/R/star_solo_processing.R +++ b/R/star_solo_processing.R @@ -842,3 +842,13 @@ make_velo_count <- function(velocyto_dirs, sample_ids, whitelist_barcodes = NULL } } +#' Load the toy M1/M2 object +#' +#' Loads a toy object of M1 and M2 used for examples or testing. +#' +#' @return An R object from the toy_m1_m2_obj.rds file. +#' @export +load_toy_M1_M2_object <- function() { + file <- system.file("extdata", "toy_m1_m2_obj.rds", package = "splikit") + readRDS(file) +} \ No newline at end of file diff --git a/inst/extdata/toy_m1_m2_obj.rds b/inst/extdata/toy_m1_m2_obj.rds new file mode 100644 index 0000000..a6fa186 Binary files /dev/null and b/inst/extdata/toy_m1_m2_obj.rds differ diff --git a/man/find_variable_events.Rd b/man/find_variable_events.Rd index 3168449..f92bafe 100644 --- a/man/find_variable_events.Rd +++ b/man/find_variable_events.Rd @@ -32,3 +32,13 @@ A \code{data.table} containing the events and their corresponding sum deviance v \description{ Calculate the Sum Deviance for Inclusion and Exclusion Matrices } +\examples{ +# loading the toy dataset +toy_obj <- load_toy_M1_M2_object() + +# getting HVE (high variable events) + HVE <- find_variable_events(toy_obj$m1, toy_obj$m2) + + # printing the results + print(HVE[order(-sum_deviance)]) +} diff --git a/man/find_variable_genes.Rd b/man/find_variable_genes.Rd index 9217e8a..1be4272 100644 --- a/man/find_variable_genes.Rd +++ b/man/find_variable_genes.Rd @@ -4,11 +4,7 @@ \alias{find_variable_genes} \title{Find Variable Genes Using Variance or Deviance-Based Metrics} \usage{ -find_variable_genes( - gene_expression_matrix, - method = c("vst", "sum_deviance"), - ... -) +find_variable_genes(gene_expression_matrix, method = "vst", ...) } \arguments{ \item{gene_expression_matrix}{A sparse gene expression matrix (of class \code{Matrix}) with gene names as row names.} @@ -29,3 +25,17 @@ variance-stabilizing transformation (VST) or deviance-based modeling. The VST me approach to compute standardized variance, while the deviance-based method models gene variability across libraries using negative binomial deviances. } +\examples{ +library(Matrix) +# loading the toy dataset +toy_obj <- load_toy_M1_M2_object() + +# getting high variable genes +HVG_VST <- find_variable_genes(toy_obj$gene_expression, method = "vst") # vst method +HVG_DEV <- find_variable_genes(toy_obj$gene_expression, method = "sum_deviance") # sum_deviance method + +# printing the results +print(HVG_VST[order(-standardize_variance)]) +print(HVG_DEV[order(-sum_deviance)]) + +} diff --git a/man/load_toy_M1_M2_object.Rd b/man/load_toy_M1_M2_object.Rd new file mode 100644 index 0000000..f0c649e --- /dev/null +++ b/man/load_toy_M1_M2_object.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/star_solo_processing.R +\name{load_toy_M1_M2_object} +\alias{load_toy_M1_M2_object} +\title{Load the toy M1/M2 object} +\usage{ +load_toy_M1_M2_object() +} +\value{ +An R object from the toy_m1_m2_obj.rds file. +} +\description{ +Loads a toy object of M1 and M2 used for examples or testing. +} diff --git a/src/Makevars.linux b/src/Makevars.linux index 4952079..e69de29 100644 --- a/src/Makevars.linux +++ b/src/Makevars.linux @@ -1,4 +0,0 @@ -## src/Makevars (Linux) -CXX_STD = CXX14 -PKG_CXXFLAGS += -O3 -march=native -pipe -fopenmp -PKG_LIBS += -fopenmp $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/deviance_gene.cpp b/src/deviance_gene.cpp index 814feb2..ec571aa 100644 --- a/src/deviance_gene.cpp +++ b/src/deviance_gene.cpp @@ -1,25 +1,23 @@ -// calcNBDeviancesWithThetaEstimation.cpp -#define ARMA_64BIT_WORD #include // [[Rcpp::depends(RcppArmadillo)]] using namespace Rcpp; -// [[Rcpp::export]] +// [[Rcpp::export]] arma::vec calcNBDeviancesWithThetaEstimation(const arma::sp_mat& gene_expression) { + int n_rows = gene_expression.n_rows; int n_cols = gene_expression.n_cols; // number of cells arma::vec dev(n_rows, arma::fill::zeros); for (int i = 0; i < n_rows; i++){ - double sum_y = 0.0; + double sum_y = 0.0; double sum_y2 = 0.0; - for (int j = 0; j < n_cols; j++){ double y = gene_expression(i, j); - sum_y += y; // compute total counts - sum_y2 += y * y; // and total of squares for gene i. + sum_y += y; // compute total counts + sum_y2 += y * y; // and total of squares for gene i. } // If there are no counts, set deviance to 0. @@ -28,36 +26,27 @@ arma::vec calcNBDeviancesWithThetaEstimation(const arma::sp_mat& gene_expression continue; } + double mu_hat = sum_y / n_cols; // intercept-only mean + double variance = (sum_y2 / n_cols) - (mu_hat * mu_hat); // empirical variance - double mu_hat = sum_y / n_cols; // Fitted mean under the intercept-only model (MLE). - - // Estimate the variance as (mean of squares) - (square of the mean). - double variance = (sum_y2 / n_cols) - (mu_hat * mu_hat); - - // Estimate theta using: theta = mu_hat^2 / (variance - mu_hat) if variance > mu_hat. - double theta_est; - if (variance > mu_hat) { - theta_est = (mu_hat * mu_hat) / (variance - mu_hat); - } else { - // If variance is not greater than mean, approximate Poisson behavior. - theta_est = 1e12; // large number to approximate theta -> infinity - } + // Estimate NB dispersion: theta = mu^2/(var − mu) if var > mu, else (approx. infiniti) + double theta_est = (variance > mu_hat) + ? (mu_hat * mu_hat) / (variance - mu_hat) + : 1e12; double dev_row = 0.0; - // Second pass: compute deviance contribution from each cell. + // Second pass: deviance contribution from each cell for (int j = 0; j < n_cols; j++){ double y = gene_expression(i, j); double term = 0.0; - // Add y * log(y/mu_hat); define 0*log(0) as 0. if (y > 0) term = y * std::log(y / mu_hat); - // Subtract (y + theta_est)*log((y + theta_est)/(mu_hat + theta_est)). - term -= (y + theta_est) * std::log((y + theta_est) / (mu_hat + theta_est)); + term -= (y + theta_est) + * std::log((y + theta_est) / (mu_hat + theta_est)); - // Multiply by 2 as per the deviance definition. - dev_row += 2 * term; + dev_row += 2.0 * term; } dev[i] = dev_row;