SingleRust · ianfd · Sep 11, 2025 · Sep 4, 2025 · Sep 4, 2025 · Sep 11, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "single-statistics"
-version = "0.7.0"
+version = "0.8.1"
 edition = "2024"
 license-file = "LICENSE.md"
 readme = "README.md"

diff --git a/src/enrichment/aucell.rs b/src/enrichment/aucell.rs
@@ -1,107 +1,2 @@
 // Following the general implementation presented here, But adapted to nalgebra_sparse and multithreading: https://github.com/scverse/decoupler/blob/main/src/decoupler/mt/_aucell.py
 
-use std::mem::offset_of;
-
-use nalgebra_sparse::CsrMatrix;
-use ndarray::Array2;
-use single_utilities::traits::{FloatOps, FloatOpsTS};
-
-fn validate_n_up<T>(n_genes: usize, n_up: Option<T>) -> anyhow::Result<usize> 
-where 
-T: FloatOps {
-    let n_up_value = match n_up {
-        Some(val) => {
-            let n_up_int = num_traits::Float::ceil(val).to_usize().unwrap();
-            n_up_int
-        },
-        None => {
-            let n_up_float = T::from(0.05).unwrap() * T::from(n_genes).unwrap();
-            let n_up_ceil = num_traits::Float::ceil(n_up_float);
-            let n_up_int = n_up_ceil.to_usize().unwrap();
-            n_up_int.clamp(2, n_genes)
-        },
-    };
-
-    if n_up_value <= 1 || n_up_value > n_genes {
-        return Err(anyhow::anyhow!(
-            "For n_genes={}, n_up={} must be between 1 and {}",
-            n_genes, n_up_value, n_genes
-        ));
-    }
-    Ok(n_up_value)
-}
-
-fn get_gene_set(
-    connectivity: &[usize],
-    starts: &[usize],
-    offsets: &[usize],
-    source_idx: usize
-) -> Vec<usize> {
-    let start = starts[source_idx];
-    let offset = offsets[source_idx];
-    connectivity[start..start + offset].to_vec()
-}
-
-fn rank_data<T>(values: &[T]) -> Vec<usize> 
-where
-    T: FloatOps 
-    {
-        let mut indexed_values: Vec<(usize, T)> = values
-        .iter()
-        .enumerate()
-        .map(|(i, &v)| (i,v))
-        .collect();
-
-        indexed_values.sort_by(|a, b| {
-            b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
-        });
-
-        let mut ranks = vec![0; values.len()];
-        for (rank, (original_idx, _)) in indexed_values.iter().enumerate() {
-            ranks[*original_idx] = rank + 1;
-        }
-
-        ranks
-    }
-
-
-
-fn compute_chunk_size(n_cells: usize, n_genes: usize) -> usize {
-    let n_cores = rayon::current_num_threads();
-
-    let base_chunk_size = if n_genes > 20000 {
-        200
-    } else if n_genes > 10000 {
-        500
-    } else {
-        1000
-    };
-
-    let min_chunks = n_cores;
-    let max_chunk_size = (n_cells + min_chunks - 1) / min_chunks;
-    base_chunk_size.min(max_chunk_size).max(1)
-}
-
-pub(crate) fn aucell_compute<T>(
-    matrix: &CsrMatrix<T>,
-    connectivity: &[usize],
-    starts: &[usize],
-    offsets: &[usize],
-    n_up: Option<T>
-) -> anyhow::Result<Array2<T>>
-where 
-    T: FloatOpsTS {
-        let n_cells = matrix.nrows();
-        let n_genes = matrix.ncols();
-        let n_sources = starts.len();
-
-        if connectivity.is_empty() || starts.is_empty() || offsets.is_empty() {
-            return Err(anyhow::anyhow!("Connectivity arrays cannot be empty!!"))
-        }
-
-        if starts.len() != offsets.len() {
-            return Err(anyhow::anyhow!("Starts and offsets must have the same length!"))
-        }
-
-        todo!()
-    }
diff --git a/src/enrichment/mod.rs b/src/enrichment/mod.rs
@@ -1,3 +1,22 @@
+//! Gene set enrichment analysis methods for single-cell data.
+//!
+//! This module provides various approaches to gene set enrichment analysis, allowing you to
+//! determine whether predefined sets of genes show statistically significant enrichment in
+//! your single-cell data.
+//!
+//! ## Available Methods
+//!
+//! - **GSEA** (`gsea`): Gene Set Enrichment Analysis using ranking-based approaches
+//! - **AUCell** (`aucell`): Area Under the Curve method for gene set activity scoring
+//! - **ORA** (`ora`): Over-Representation Analysis using hypergeometric testing
+//!
+//! ## Quick Example
+//!
+//! ```rust,no_run
+//! // Gene set enrichment analysis workflow would go here
+//! // (Implementation depends on the specific modules)
+//! ```
+
 mod gsea;
 mod aucell;
 mod ora;
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,2 +1,28 @@
+//! # single-statistics
+//!
+//! A specialized Rust library for statistical analysis of single-cell data, part of the single-rust ecosystem.
+//!
+//! This crate provides robust statistical methods for biological analysis of single-cell data, focusing on
+//! differential expression analysis, marker gene identification, and related statistical tests. It is optimized
+//! for sparse single-cell matrices and provides both parametric and non-parametric statistical tests.
+//!
+//! ## Core Features
+//!
+//! - **Differential Expression Analysis**: T-tests, Mann-Whitney U tests, and other statistical methods
+//! - **Multiple Testing Correction**: FDR, Bonferroni, and other correction methods
+//! - **Effect Size Calculations**: Cohen's d and other effect size measures
+//! - **Sparse Matrix Support**: Optimized for `CsrMatrix` from nalgebra-sparse
+//!
+//! ## Quick Start
+//!
+//! Use the `MatrixStatTests` trait to perform differential expression analysis on sparse matrices.
+//! The library supports various statistical tests including t-tests and Mann-Whitney U tests,
+//! with automatic multiple testing correction.
+//!
+//! ## Module Organization
+//!
+//! - **[`testing`]**: Statistical tests, hypothesis testing, and multiple testing correction
+//! - **[`enrichment`]**: Gene set enrichment analysis methods (GSEA, ORA, AUCell)
+
 pub mod testing;
 pub mod enrichment;
diff --git a/src/testing/correction/mod.rs b/src/testing/correction/mod.rs
@@ -1,10 +1,28 @@
+//! Multiple testing correction methods for controlling false positives in differential expression analysis.
+//!
+//! When testing thousands of genes simultaneously (as is common in single-cell RNA-seq analysis),
+//! the probability of false positives increases dramatically. These correction methods help control
+//! either the Family-Wise Error Rate (FWER) or False Discovery Rate (FDR).
+//!
+//! ## Available Methods
+//!
+//! - **Bonferroni**: Conservative FWER control, multiplies p-values by number of tests
+//! - **Benjamini-Hochberg**: FDR control, less conservative than Bonferroni
+//! - **Benjamini-Yekutieli**: FDR control for dependent tests
+//! - **Holm-Bonferroni**: Step-down FWER control, less conservative than Bonferroni
+//! - **Storey's q-value**: Improved FDR estimation
+//!
+//! ## Choosing a Method
+//!
+//! - **For single-cell DE analysis**: Use Benjamini-Hochberg (most common)
+//! - **For very strict control**: Use Bonferroni or Holm-Bonferroni
+//! - **For dependent tests**: Use Benjamini-Yekutieli
+//! - **For large datasets**: Consider Storey's q-value
+
 use anyhow::{Result, anyhow};
 use single_utilities::traits::FloatOps;
 use std::cmp::Ordering;
 
-/// Multiple testing correction methods to control for false positives
-/// when performing many statistical tests simultaneously.
-
 /// Apply Bonferroni correction to p-values
 ///
 /// Bonferroni correction is a simple but conservative method that multiplies

diff --git a/src/testing/inference/mod.rs b/src/testing/inference/mod.rs
@@ -11,24 +11,85 @@ pub mod parametric;
 
 pub mod nonparametric;
 
+/// Statistical testing methods for sparse matrices, particularly suited for single-cell data.
+///
+/// This trait extends sparse matrix types (like `CsrMatrix`) with statistical testing capabilities.
+/// It provides methods for differential expression analysis and other statistical comparisons
+/// optimized for single-cell RNA-seq data.
+///
+/// # Matrix Format
+///
+/// The expected matrix format is **genes × cells** (features × observations), where:
+/// - Rows represent genes/features
+/// - Columns represent cells/observations
+/// - Values represent expression levels (normalized counts, log-transformed, etc.)
+///
+
 pub trait MatrixStatTests<T>
 where
     T: FloatOpsTS,
 {
+    /// Perform t-tests comparing two groups of cells for all genes.
+    ///
+    /// Runs Student's or Welch's t-test for each gene (row) in the matrix, comparing
+    /// expression levels between the specified groups of cells (columns).
+    ///
+    /// # Arguments
+    ///
+    /// * `group1_indices` - Column indices for the first group of cells
+    /// * `group2_indices` - Column indices for the second group of cells  
+    /// * `test_type` - Type of t-test (`Student` or `Welch`)
+    ///
+    /// # Returns
+    ///
+    /// Vector of `TestResult` objects, one per gene, containing test statistics and p-values.
+    ///
+
     fn t_test(
         &self,
         group1_indices: &[usize],
         group2_indices: &[usize],
         test_type: TTestType,
     ) -> anyhow::Result<Vec<TestResult<f64>>>;
 
+    /// Perform Mann-Whitney U tests comparing two groups of cells for all genes.
+    ///
+    /// Runs non-parametric Mann-Whitney U (Wilcoxon rank-sum) tests for each gene,
+    /// comparing the distributions between the specified groups.
+    ///
+    /// # Arguments
+    ///
+    /// * `group1_indices` - Column indices for the first group of cells
+    /// * `group2_indices` - Column indices for the second group of cells
+    /// * `alternative` - Type of alternative hypothesis (two-sided, less, greater)
+    ///
+    /// # Returns
+    ///
+    /// Vector of `TestResult` objects containing U statistics and p-values.
     fn mann_whitney_test(
         &self,
         group1_indices: &[usize],
         group2_indices: &[usize],
         alternative: Alternative,
     ) -> anyhow::Result<Vec<TestResult<f64>>>;
 
+    /// Comprehensive differential expression analysis with multiple testing correction.
+    ///
+    /// This is the main method for differential expression analysis. It performs the
+    /// specified statistical test on all genes and applies multiple testing correction
+    /// to control the false discovery rate.
+    ///
+    /// # Arguments
+    ///
+    /// * `group_ids` - Vector assigning each cell to a group (currently supports 2 groups)
+    /// * `test_method` - Statistical test to perform
+    ///
+    /// # Returns
+    ///
+    /// `MultipleTestResults` containing statistics, p-values, adjusted p-values, and
+    /// metadata for all genes.
+    ///
+
     fn differential_expression(
         &self,
         group_ids: &[usize],

diff --git a/src/testing/inference/nonparametric.rs b/src/testing/inference/nonparametric.rs
@@ -1,3 +1,12 @@
+//! Non-parametric statistical tests for single-cell data analysis.
+//!
+//! This module implements non-parametric statistical tests that make fewer assumptions about
+//! data distribution. These tests are particularly useful for single-cell data which often
+//! exhibits non-normal distributions, high sparsity, and outliers.
+//!
+//! The primary test implemented is the Mann-Whitney U test (also known as the Wilcoxon 
+//! rank-sum test), which compares the distributions of two groups without assuming normality.
+
 use std::{cmp::Ordering, f64};
 
 use nalgebra_sparse::CsrMatrix;
@@ -13,6 +22,34 @@ struct TieInfo {
     tie_correction: f64,
 }
 
+/// Perform Mann-Whitney U tests on all genes comparing two groups of cells.
+///
+/// This function efficiently computes Mann-Whitney U statistics for all genes in a sparse
+/// matrix, comparing expression distributions between two groups of cells. The implementation
+/// uses parallel processing for improved performance on large datasets.
+///
+/// # Arguments
+///
+/// * `matrix` - Sparse expression matrix (genes × cells)
+/// * `group1_indices` - Column indices for the first group of cells
+/// * `group2_indices` - Column indices for the second group of cells
+/// * `alternative` - Type of alternative hypothesis (two-sided, less, greater)
+///
+/// # Returns
+///
+/// Vector of `TestResult` objects containing U statistics and p-values for each gene.
+/// let group1 = vec![0, 1, 2]; // First group of cells
+/// let group2 = vec![3, 4, 5]; // Second group of cells
+///
+/// let results = mann_whitney_matrix_groups(
+///     &matrix, 
+///     &group1, 
+///     &group2, 
+///     Alternative::TwoSided
+/// )?;
+/// # Ok(())
+/// # }
+/// ```
 pub fn mann_whitney_matrix_groups<T>(
     matrix: &CsrMatrix<T>,
     group1_indices: &[usize],
@@ -48,6 +85,34 @@ where
     Ok(results)
 }
 
+/// Perform an optimized Mann-Whitney U test on two samples.
+///
+/// This function computes the Mann-Whitney U statistic and p-value for comparing two
+/// independent samples. It handles ties correctly and supports different alternative
+/// hypotheses.
+///
+/// # Arguments
+///
+/// * `x` - First sample
+/// * `y` - Second sample
+/// * `alternative` - Type of alternative hypothesis
+///
+/// # Returns
+///
+/// `TestResult` containing the U statistic and p-value.
+///
+/// # Example
+///
+/// ```rust
+/// use single_statistics::testing::inference::nonparametric::mann_whitney_optimized;
+/// use single_statistics::testing::Alternative;
+///
+/// let group1 = vec![1.0, 2.0, 3.0];
+/// let group2 = vec![4.0, 5.0, 6.0];
+/// let result = mann_whitney_optimized(&group1, &group2, Alternative::TwoSided);
+/// 
+/// println!("U statistic: {}, p-value: {}", result.statistic, result.p_value);
+/// ```
 pub fn mann_whitney_optimized(x: &[f64], y: &[f64], alternative: Alternative) -> TestResult<f64> {
     let nx = x.len();
     let ny = y.len();