Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "single-statistics"
version = "0.7.0"
version = "0.8.1"
edition = "2024"
license-file = "LICENSE.md"
readme = "README.md"
Expand Down
105 changes: 0 additions & 105 deletions src/enrichment/aucell.rs
Original file line number Diff line number Diff line change
@@ -1,107 +1,2 @@
// Following the general implementation presented here, But adapted to nalgebra_sparse and multithreading: https://github.com/scverse/decoupler/blob/main/src/decoupler/mt/_aucell.py

use std::mem::offset_of;

use nalgebra_sparse::CsrMatrix;
use ndarray::Array2;
use single_utilities::traits::{FloatOps, FloatOpsTS};

fn validate_n_up<T>(n_genes: usize, n_up: Option<T>) -> anyhow::Result<usize>
where
T: FloatOps {
let n_up_value = match n_up {
Some(val) => {
let n_up_int = num_traits::Float::ceil(val).to_usize().unwrap();
n_up_int
},
None => {
let n_up_float = T::from(0.05).unwrap() * T::from(n_genes).unwrap();
let n_up_ceil = num_traits::Float::ceil(n_up_float);
let n_up_int = n_up_ceil.to_usize().unwrap();
n_up_int.clamp(2, n_genes)
},
};

if n_up_value <= 1 || n_up_value > n_genes {
return Err(anyhow::anyhow!(
"For n_genes={}, n_up={} must be between 1 and {}",
n_genes, n_up_value, n_genes
));
}
Ok(n_up_value)
}

fn get_gene_set(
connectivity: &[usize],
starts: &[usize],
offsets: &[usize],
source_idx: usize
) -> Vec<usize> {
let start = starts[source_idx];
let offset = offsets[source_idx];
connectivity[start..start + offset].to_vec()
}

fn rank_data<T>(values: &[T]) -> Vec<usize>
where
T: FloatOps
{
let mut indexed_values: Vec<(usize, T)> = values
.iter()
.enumerate()
.map(|(i, &v)| (i,v))
.collect();

indexed_values.sort_by(|a, b| {
b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
});

let mut ranks = vec![0; values.len()];
for (rank, (original_idx, _)) in indexed_values.iter().enumerate() {
ranks[*original_idx] = rank + 1;
}

ranks
}



fn compute_chunk_size(n_cells: usize, n_genes: usize) -> usize {
let n_cores = rayon::current_num_threads();

let base_chunk_size = if n_genes > 20000 {
200
} else if n_genes > 10000 {
500
} else {
1000
};

let min_chunks = n_cores;
let max_chunk_size = (n_cells + min_chunks - 1) / min_chunks;
base_chunk_size.min(max_chunk_size).max(1)
}

pub(crate) fn aucell_compute<T>(
matrix: &CsrMatrix<T>,
connectivity: &[usize],
starts: &[usize],
offsets: &[usize],
n_up: Option<T>
) -> anyhow::Result<Array2<T>>
where
T: FloatOpsTS {
let n_cells = matrix.nrows();
let n_genes = matrix.ncols();
let n_sources = starts.len();

if connectivity.is_empty() || starts.is_empty() || offsets.is_empty() {
return Err(anyhow::anyhow!("Connectivity arrays cannot be empty!!"))
}

if starts.len() != offsets.len() {
return Err(anyhow::anyhow!("Starts and offsets must have the same length!"))
}

todo!()
}
19 changes: 19 additions & 0 deletions src/enrichment/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,22 @@
//! Gene set enrichment analysis methods for single-cell data.
//!
//! This module provides various approaches to gene set enrichment analysis, allowing you to
//! determine whether predefined sets of genes show statistically significant enrichment in
//! your single-cell data.
//!
//! ## Available Methods
//!
//! - **GSEA** (`gsea`): Gene Set Enrichment Analysis using ranking-based approaches
//! - **AUCell** (`aucell`): Area Under the Curve method for gene set activity scoring
//! - **ORA** (`ora`): Over-Representation Analysis using hypergeometric testing
//!
//! ## Quick Example
//!
//! ```rust,no_run
//! // Gene set enrichment analysis workflow would go here
//! // (Implementation depends on the specific modules)
//! ```

mod gsea;
mod aucell;
mod ora;
26 changes: 26 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,28 @@
//! # single-statistics
//!
//! A specialized Rust library for statistical analysis of single-cell data, part of the single-rust ecosystem.
//!
//! This crate provides robust statistical methods for biological analysis of single-cell data, focusing on
//! differential expression analysis, marker gene identification, and related statistical tests. It is optimized
//! for sparse single-cell matrices and provides both parametric and non-parametric statistical tests.
//!
//! ## Core Features
//!
//! - **Differential Expression Analysis**: T-tests, Mann-Whitney U tests, and other statistical methods
//! - **Multiple Testing Correction**: FDR, Bonferroni, and other correction methods
//! - **Effect Size Calculations**: Cohen's d and other effect size measures
//! - **Sparse Matrix Support**: Optimized for `CsrMatrix` from nalgebra-sparse
//!
//! ## Quick Start
//!
//! Use the `MatrixStatTests` trait to perform differential expression analysis on sparse matrices.
//! The library supports various statistical tests including t-tests and Mann-Whitney U tests,
//! with automatic multiple testing correction.
//!
//! ## Module Organization
//!
//! - **[`testing`]**: Statistical tests, hypothesis testing, and multiple testing correction
//! - **[`enrichment`]**: Gene set enrichment analysis methods (GSEA, ORA, AUCell)

pub mod testing;
pub mod enrichment;
24 changes: 21 additions & 3 deletions src/testing/correction/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,28 @@
//! Multiple testing correction methods for controlling false positives in differential expression analysis.
//!
//! When testing thousands of genes simultaneously (as is common in single-cell RNA-seq analysis),
//! the probability of false positives increases dramatically. These correction methods help control
//! either the Family-Wise Error Rate (FWER) or False Discovery Rate (FDR).
//!
//! ## Available Methods
//!
//! - **Bonferroni**: Conservative FWER control, multiplies p-values by number of tests
//! - **Benjamini-Hochberg**: FDR control, less conservative than Bonferroni
//! - **Benjamini-Yekutieli**: FDR control for dependent tests
//! - **Holm-Bonferroni**: Step-down FWER control, less conservative than Bonferroni
//! - **Storey's q-value**: Improved FDR estimation
//!
//! ## Choosing a Method
//!
//! - **For single-cell DE analysis**: Use Benjamini-Hochberg (most common)
//! - **For very strict control**: Use Bonferroni or Holm-Bonferroni
//! - **For dependent tests**: Use Benjamini-Yekutieli
//! - **For large datasets**: Consider Storey's q-value

use anyhow::{Result, anyhow};
use single_utilities::traits::FloatOps;
use std::cmp::Ordering;

/// Multiple testing correction methods to control for false positives
/// when performing many statistical tests simultaneously.

/// Apply Bonferroni correction to p-values
///
/// Bonferroni correction is a simple but conservative method that multiplies
Expand Down
61 changes: 61 additions & 0 deletions src/testing/inference/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,85 @@ pub mod parametric;

pub mod nonparametric;

/// Statistical testing methods for sparse matrices, particularly suited for single-cell data.
///
/// This trait extends sparse matrix types (like `CsrMatrix`) with statistical testing capabilities.
/// It provides methods for differential expression analysis and other statistical comparisons
/// optimized for single-cell RNA-seq data.
///
/// # Matrix Format
///
/// The expected matrix format is **genes × cells** (features × observations), where:
/// - Rows represent genes/features
/// - Columns represent cells/observations
/// - Values represent expression levels (normalized counts, log-transformed, etc.)
///

pub trait MatrixStatTests<T>
where
T: FloatOpsTS,
{
/// Perform t-tests comparing two groups of cells for all genes.
///
/// Runs Student's or Welch's t-test for each gene (row) in the matrix, comparing
/// expression levels between the specified groups of cells (columns).
///
/// # Arguments
///
/// * `group1_indices` - Column indices for the first group of cells
/// * `group2_indices` - Column indices for the second group of cells
/// * `test_type` - Type of t-test (`Student` or `Welch`)
///
/// # Returns
///
/// Vector of `TestResult` objects, one per gene, containing test statistics and p-values.
///

fn t_test(
&self,
group1_indices: &[usize],
group2_indices: &[usize],
test_type: TTestType,
) -> anyhow::Result<Vec<TestResult<f64>>>;

/// Perform Mann-Whitney U tests comparing two groups of cells for all genes.
///
/// Runs non-parametric Mann-Whitney U (Wilcoxon rank-sum) tests for each gene,
/// comparing the distributions between the specified groups.
///
/// # Arguments
///
/// * `group1_indices` - Column indices for the first group of cells
/// * `group2_indices` - Column indices for the second group of cells
/// * `alternative` - Type of alternative hypothesis (two-sided, less, greater)
///
/// # Returns
///
/// Vector of `TestResult` objects containing U statistics and p-values.
fn mann_whitney_test(
&self,
group1_indices: &[usize],
group2_indices: &[usize],
alternative: Alternative,
) -> anyhow::Result<Vec<TestResult<f64>>>;

/// Comprehensive differential expression analysis with multiple testing correction.
///
/// This is the main method for differential expression analysis. It performs the
/// specified statistical test on all genes and applies multiple testing correction
/// to control the false discovery rate.
///
/// # Arguments
///
/// * `group_ids` - Vector assigning each cell to a group (currently supports 2 groups)
/// * `test_method` - Statistical test to perform
///
/// # Returns
///
/// `MultipleTestResults` containing statistics, p-values, adjusted p-values, and
/// metadata for all genes.
///

fn differential_expression(
&self,
group_ids: &[usize],
Expand Down
65 changes: 65 additions & 0 deletions src/testing/inference/nonparametric.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
//! Non-parametric statistical tests for single-cell data analysis.
//!
//! This module implements non-parametric statistical tests that make fewer assumptions about
//! data distribution. These tests are particularly useful for single-cell data which often
//! exhibits non-normal distributions, high sparsity, and outliers.
//!
//! The primary test implemented is the Mann-Whitney U test (also known as the Wilcoxon
//! rank-sum test), which compares the distributions of two groups without assuming normality.

use std::{cmp::Ordering, f64};

use nalgebra_sparse::CsrMatrix;
Expand All @@ -13,6 +22,34 @@ struct TieInfo {
tie_correction: f64,
}

/// Perform Mann-Whitney U tests on all genes comparing two groups of cells.
///
/// This function efficiently computes Mann-Whitney U statistics for all genes in a sparse
/// matrix, comparing expression distributions between two groups of cells. The implementation
/// uses parallel processing for improved performance on large datasets.
///
/// # Arguments
///
/// * `matrix` - Sparse expression matrix (genes × cells)
/// * `group1_indices` - Column indices for the first group of cells
/// * `group2_indices` - Column indices for the second group of cells
/// * `alternative` - Type of alternative hypothesis (two-sided, less, greater)
///
/// # Returns
///
/// Vector of `TestResult` objects containing U statistics and p-values for each gene.
/// let group1 = vec![0, 1, 2]; // First group of cells
/// let group2 = vec![3, 4, 5]; // Second group of cells
///
/// let results = mann_whitney_matrix_groups(
/// &matrix,
/// &group1,
/// &group2,
/// Alternative::TwoSided
/// )?;
/// # Ok(())
/// # }
/// ```
pub fn mann_whitney_matrix_groups<T>(
matrix: &CsrMatrix<T>,
group1_indices: &[usize],
Expand Down Expand Up @@ -48,6 +85,34 @@ where
Ok(results)
}

/// Perform an optimized Mann-Whitney U test on two samples.
///
/// This function computes the Mann-Whitney U statistic and p-value for comparing two
/// independent samples. It handles ties correctly and supports different alternative
/// hypotheses.
///
/// # Arguments
///
/// * `x` - First sample
/// * `y` - Second sample
/// * `alternative` - Type of alternative hypothesis
///
/// # Returns
///
/// `TestResult` containing the U statistic and p-value.
///
/// # Example
///
/// ```rust
/// use single_statistics::testing::inference::nonparametric::mann_whitney_optimized;
/// use single_statistics::testing::Alternative;
///
/// let group1 = vec![1.0, 2.0, 3.0];
/// let group2 = vec![4.0, 5.0, 6.0];
/// let result = mann_whitney_optimized(&group1, &group2, Alternative::TwoSided);
///
/// println!("U statistic: {}, p-value: {}", result.statistic, result.p_value);
/// ```
pub fn mann_whitney_optimized(x: &[f64], y: &[f64], alternative: Alternative) -> TestResult<f64> {
let nx = x.len();
let ny = y.len();
Expand Down
Loading