diff --git a/Cargo.toml b/Cargo.toml index 5bd52a23b..343cf6f64 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,10 @@ rand_isaac = "0.2.0" ndarray-npy = { version = "0.5", default-features = false } [workspace] -members = ["linfa-clustering"] +members = [ + "linfa-clustering", + "linfa-reduction" +] [profile.release] opt-level = 3 diff --git a/linfa-clustering/Cargo.toml b/linfa-clustering/Cargo.toml index c9e903bf8..1e9bae844 100644 --- a/linfa-clustering/Cargo.toml +++ b/linfa-clustering/Cargo.toml @@ -16,7 +16,10 @@ categories = ["algorithms", "mathematics", "science"] ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]} ndarray-rand = "0.11" ndarray-stats = "0.3" +ndarray-linalg = { version = "0.12", features = ["openblas"] } +sprs = "0.7" serde = { version = "1", features = ["derive"] } +num-traits = "0.1.32" [dev-dependencies] rand_isaac = "0.2.0" diff --git a/linfa-clustering/examples/kmeans.rs b/linfa-clustering/examples/kmeans.rs index a9fb71e9c..dde643b64 100644 --- a/linfa-clustering/examples/kmeans.rs +++ b/linfa-clustering/examples/kmeans.rs @@ -37,3 +37,4 @@ fn main() { ) .expect("Failed to write .npy file"); } + diff --git a/linfa-clustering/src/lib.rs b/linfa-clustering/src/lib.rs index 9f262177f..d0ebc7dd7 100644 --- a/linfa-clustering/src/lib.rs +++ b/linfa-clustering/src/lib.rs @@ -18,6 +18,10 @@ //! Implementation choices, algorithmic details and a tutorial can be found [here](struct.KMeans.html). //! //! Check [here](https://github.com/LukeMathWalker/clustering-benchmarks) for extensive benchmarks against `scikit-learn`'s K-means implementation. 
+ +extern crate ndarray; +extern crate ndarray_linalg; + mod dbscan; #[allow(clippy::new_ret_no_self)] mod k_means; diff --git a/linfa-reduction/Cargo.toml b/linfa-reduction/Cargo.toml new file mode 100644 index 000000000..fcce021a9 --- /dev/null +++ b/linfa-reduction/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "linfa-reduction" +version = "0.1.0" +authors = ["Lorenz Schmidt "] +description = "A collection of dimensionality reduction techniques" +edition = "2018" +license = "MIT/Apache-2.0" + +repository = "https://github.com/LukeMathWalker/linfa" +readme = "README.md" + +keywords = ["dimensionality", "machine-learning", "linfa", "spectral", "unsupervised"] +categories = ["algorithms", "mathematics", "science"] + +[dependencies] +ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]} +ndarray-rand = "0.11" +ndarray-stats = "0.3" +ndarray-linalg = { version = "0.12", features = ["openblas"] } +sprs = "0.7" +serde = { version = "1", features = ["derive"] } +num-traits = "0.1.32" +hnsw = "0.6" +space = "0.10" + +[dev-dependencies] +rand_isaac = "0.2.0" +ndarray-npy = { version = "0.5", default-features = false } +criterion = "0.3" +serde_json = "1" +approx = "0.3" +linfa-clustering = { path = "../linfa-clustering" } diff --git a/linfa-reduction/examples/diffusion_map.rs b/linfa-reduction/examples/diffusion_map.rs new file mode 100644 index 000000000..5c474eb9b --- /dev/null +++ b/linfa-reduction/examples/diffusion_map.rs @@ -0,0 +1,34 @@ +use linfa_reduction::utils::generate_convoluted_rings;; +use linfa_reduction::kernel::{IntoKernel, SparsePolynomialKernel}; +use ndarray_npy::write_npy; +use ndarray_rand::rand::SeedableRng; +use rand_isaac::Isaac64Rng; + +// A routine diffusion map task: build a synthetic dataset, compute a low-dimensional embedding +// and save both training data and the embedding to disk.
+fn main() { + // Our random number generator, seeded for reproducibility + let mut rng = Isaac64Rng::seed_from_u64(42); + + // Total number of data points to sample across the rings + let n = 3000; + + // generate three convoluted rings + let dataset = generate_convoluted_rings(&[(0.0, 3.0), (10.0, 13.0), (20.0, 23.0)], n, &mut rng); + + // generate sparse polynomial kernel with k = 14, c = 5 and d = 2 + let diffusion_map = SparsePolynomialKernel::new(&dataset, 14, 5.0, 2.0) + .reduce_fixed(4); + + // get embedding + let embedding = diffusion_map.embedding(); + + // Save to disk our dataset and the learned embedding + // We use the `npy` format for compatibility with NumPy + write_npy("diffusion_map_dataset.npy", dataset).expect("Failed to write .npy file"); + write_npy( + "diffusion_map_embedding.npy", + embedding + ) + .expect("Failed to write .npy file"); +} diff --git a/linfa-reduction/examples/pca.rs b/linfa-reduction/examples/pca.rs new file mode 100644 index 000000000..834ed6cd8 --- /dev/null +++ b/linfa-reduction/examples/pca.rs @@ -0,0 +1,34 @@ +use linfa_clustering::generate_blobs; +use linfa_reduction::PrincipalComponentAnalysis; +use ndarray::array; +use ndarray_npy::write_npy; +use ndarray_rand::rand::SeedableRng; +use rand_isaac::Isaac64Rng; + +// A routine PCA task: build a synthetic dataset, fit the model on it +// and save both training data and the projected embedding to disk.
+fn main() { + // Our random number generator, seeded for reproducibility + let mut rng = Isaac64Rng::seed_from_u64(42); + + // For each our expected centroids, generate `n` data points around it (a "blob") + let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],]; + let n = 10; + let dataset = generate_blobs(n, &expected_centroids, &mut rng); + + dbg!(&dataset); + + let embedding = PrincipalComponentAnalysis::fit(dataset.clone(), 1) + .predict(&dataset); + + dbg!(&embedding); + + // Save to disk our dataset (and the cluster label assigned to each observation) + // We use the `npy` format for compatibility with NumPy + write_npy("pca_dataset.npy", dataset).expect("Failed to write .npy file"); + write_npy( + "pca_embedding.npy", + embedding + ) + .expect("Failed to write .npy file"); +} diff --git a/linfa-reduction/src/diffusion_map/algorithms.rs b/linfa-reduction/src/diffusion_map/algorithms.rs new file mode 100644 index 000000000..333c113f3 --- /dev/null +++ b/linfa-reduction/src/diffusion_map/algorithms.rs @@ -0,0 +1,113 @@ +use ndarray::{Array1, Array2}; +use ndarray_linalg::{TruncatedOrder, lobpcg::LobpcgResult, lobpcg, eigh::EighInto, lapack::UPLO, Scalar}; +use ndarray_rand::{rand_distr::Uniform, RandomExt}; +use num_traits::NumCast; + +use crate::Float; +use crate::kernel::{Kernel, IntoKernel}; +use super::hyperparameters::DiffusionMapHyperParams; + +pub struct DiffusionMap { + hyperparameters: DiffusionMapHyperParams, + embedding: Array2, + eigvals: Array1 +} + +impl DiffusionMap { + pub fn project( + hyperparameters: DiffusionMapHyperParams, + kernel: impl IntoKernel + ) -> Self { + // compute spectral embedding with diffusion map + let (embedding, eigvals) = compute_diffusion_map(kernel.into_kernel(), hyperparameters.steps(), 0.0, hyperparameters.embedding_size(), None); + + DiffusionMap { + hyperparameters, + embedding, + eigvals + } + } + + /// Return the hyperparameters used to train this spectral mode instance. 
+ pub fn hyperparameters(&self) -> &DiffusionMapHyperParams { + &self.hyperparameters + } + + /// Estimate the number of clusters in this embedding (very crude for now) + pub fn estimate_clusters(&self) -> usize { + let mean = self.eigvals.sum() / NumCast::from(self.eigvals.len()).unwrap(); + self.eigvals.iter().filter(|x| *x > &mean).count() + 1 + } + + /// Return a copy of the eigenvalues + pub fn eigvals(&self) -> Array1 { + self.eigvals.clone() + } + + pub fn embedding(&self) -> Array2 { + self.embedding.clone() + } +} + +fn compute_diffusion_map(kernel: impl Kernel, steps: usize, alpha: f32, embedding_size: usize, guess: Option>) -> (Array2, Array1) { + assert!(embedding_size < kernel.size()); + + let d = kernel.sum() + .mapv(|x| x.recip()); + + let d2 = d.mapv(|x| x.powf(NumCast::from(0.5 + alpha).unwrap())); + + // use full eigenvalue decomposition for small problem sizes + let (vals, mut vecs) = if kernel.size() < 5 * embedding_size + 1 { + let mut matrix = kernel.mul_similarity(&Array2::from_diag(&d).view()); + matrix.gencolumns_mut().into_iter().zip(d.iter()).for_each(|(mut a,b)| a *= *b); + + let (vals, vecs) = matrix.eigh_into(UPLO::Lower).unwrap(); + let (vals, vecs) = (vals.slice_move(s![..; -1]), vecs.slice_move(s![.., ..; -1])); + ( + vals.slice_move(s![1..embedding_size+1]).mapv(|x| Scalar::from_real(x)), + vecs.slice_move(s![..,1..embedding_size+1]) + ) + } else { + // calculate truncated eigenvalue decomposition + let x = guess.unwrap_or( + Array2::random((kernel.size(), embedding_size + 1), Uniform::new(0.0f64, 1.0)) + .mapv(|x| NumCast::from(x).unwrap()) + ); + + let result = lobpcg::lobpcg(|y| { + let mut y = y.to_owned(); + y.genrows_mut().into_iter().zip(d2.iter()).for_each(|(mut a,b)| a *= *b); + let mut y = kernel.mul_similarity(&y.view()); + y.genrows_mut().into_iter().zip(d2.iter()).for_each(|(mut a,b)| a *= *b); + + y + }, x, |_| {}, None, 1e-15, 200, TruncatedOrder::Largest); + + let (vals, vecs) = match result { + 
LobpcgResult::Ok(vals, vecs, _) | LobpcgResult::Err(vals, vecs, _, _) => { (vals, vecs)}, + _ => panic!("Eigendecomposition failed!") + }; + + + // cut away first eigenvalue/eigenvector + ( + vals.slice_move(s![1..]), + vecs.slice_move(s![..,1..]) + ) + }; + + let d = d.mapv(|x| x.sqrt()); + + for (mut col, val) in vecs.genrows_mut().into_iter().zip(d.iter()) { + col *= *val; + } + + let steps = NumCast::from(steps).unwrap(); + for (mut vec, val) in vecs.gencolumns_mut().into_iter().zip(vals.iter()) { + vec *= val.powf(steps); + } + + (vecs, vals) +} + diff --git a/linfa-reduction/src/diffusion_map/hyperparameters.rs b/linfa-reduction/src/diffusion_map/hyperparameters.rs new file mode 100644 index 000000000..c35dfad30 --- /dev/null +++ b/linfa-reduction/src/diffusion_map/hyperparameters.rs @@ -0,0 +1,48 @@ +pub struct DiffusionMapHyperParams { + steps: usize, + embedding_size: usize, +} + +pub struct DiffusionMapHyperParamsBuilder { + steps: usize, + embedding_size: usize, +} + +impl DiffusionMapHyperParamsBuilder { + pub fn steps(mut self, steps: usize) -> Self { + self.steps = steps; + + self + } + + pub fn build(self) -> DiffusionMapHyperParams { + DiffusionMapHyperParams::build( + self.steps, + self.embedding_size, + ) + } +} + +impl DiffusionMapHyperParams { + pub fn new(embedding_size: usize) -> DiffusionMapHyperParamsBuilder { + DiffusionMapHyperParamsBuilder { + steps: 10, + embedding_size + } + } + + pub fn steps(&self) -> usize { + self.steps + } + + pub fn embedding_size(&self) -> usize { + self.embedding_size + } + + pub fn build(steps: usize, embedding_size: usize) -> Self { + DiffusionMapHyperParams { + steps, + embedding_size, + } + } +} diff --git a/linfa-reduction/src/diffusion_map/mod.rs b/linfa-reduction/src/diffusion_map/mod.rs new file mode 100644 index 000000000..2d0728a5d --- /dev/null +++ b/linfa-reduction/src/diffusion_map/mod.rs @@ -0,0 +1,5 @@ +mod algorithms; +mod hyperparameters; + +pub use algorithms::*; +pub use hyperparameters::*; 
diff --git a/linfa-reduction/src/kernel/gaussian.rs b/linfa-reduction/src/kernel/gaussian.rs new file mode 100644 index 000000000..20de6f321 --- /dev/null +++ b/linfa-reduction/src/kernel/gaussian.rs @@ -0,0 +1,59 @@ +use num_traits::NumCast; +use ndarray::{Array2, ArrayView1}; +use sprs::CsMat; +use std::iter::Sum; + +use crate::Float; +use crate::kernel::{IntoKernel, dense_from_fn, sparse_from_fn}; + +fn kernel(a: ArrayView1, b: ArrayView1, eps: A) -> A { + let distance = a.iter().zip(b.iter()).map(|(x,y)| (*x-*y)*(*x-*y)) + .sum::(); + + (-distance / eps).exp() +} + +pub struct GaussianKernel { + pub data: Array2 +} + +impl> GaussianKernel { + pub fn new(dataset: &Array2, eps: f32) -> Self { + let eps = NumCast::from(eps).unwrap(); + + GaussianKernel { + data: dense_from_fn(dataset, |a,b| kernel(a,b,eps)) + } + } +} + +impl IntoKernel for GaussianKernel { + type IntoKer = Array2; + + fn into_kernel(self) -> Self::IntoKer { + self.data + } +} + +pub struct SparseGaussianKernel { + similarity: CsMat +} + +impl SparseGaussianKernel { + pub fn new(dataset: &Array2, k: usize, eps: f32) -> Self { + let eps = NumCast::from(eps).unwrap(); + + SparseGaussianKernel { + similarity: sparse_from_fn(dataset, k, |a,b| kernel(a,b,eps)) + } + } +} + +impl IntoKernel for SparseGaussianKernel { + type IntoKer = CsMat; + + fn into_kernel(self) -> Self::IntoKer { + self.similarity + } +} + diff --git a/linfa-reduction/src/kernel/mod.rs b/linfa-reduction/src/kernel/mod.rs new file mode 100644 index 000000000..808cbdd03 --- /dev/null +++ b/linfa-reduction/src/kernel/mod.rs @@ -0,0 +1,140 @@ +pub mod sparse; +pub mod gaussian; +pub mod polynomial; + +pub use gaussian::{GaussianKernel, SparseGaussianKernel}; +pub use polynomial::{PolynomialKernel, SparsePolynomialKernel}; + +use ndarray::prelude::*; +use ndarray::Data; +use sprs::CsMat; + +use crate::Float; +use crate::diffusion_map::{DiffusionMapHyperParams, DiffusionMap}; + +/// Symmetric kernel function, can be sparse or dense +/// 
+/// Required functions are: +/// * `mul_similarity`: performs a matrix multiplication with `rhs`, which is a +/// normalized (sum of columns and rows equal one) symmetric similarity matrix +/// * `size`: returns the number of data-points +pub trait Kernel { + fn mul_similarity(&self, rhs: &ArrayView2) -> Array2; + fn sum(&self) -> Array1; + fn size(&self) -> usize; + +} + +impl, A: Float> Kernel for ArrayBase { + fn mul_similarity(&self, rhs: &ArrayView2) -> Array2 { + self.dot(rhs) + } + + fn sum(&self) -> Array1 { + self.sum_axis(Axis(1)) + } + + fn size(&self) -> usize { + assert_eq!(self.ncols(), self.nrows()); + + self.ncols() + } +} + +impl Kernel for CsMat { + fn mul_similarity(&self, rhs: &ArrayView2) -> Array2 { + self * rhs + } + + fn sum(&self) -> Array1 { + let mut sum = Array1::zeros(self.cols()); + + for (val, i) in self.iter() { + let (_, col) = i; + sum[col] += *val; + } + + sum + } + + fn size(&self) -> usize { + assert_eq!(self.cols(), self.rows()); + + self.cols() + } +} + +/// Converts an object into a kernel +pub trait IntoKernel { + type IntoKer: Kernel; + + fn into_kernel(self) -> Self::IntoKer; + + fn reduce_fixed(self, embedding_size: usize) -> DiffusionMap + where Self: Sized + { + let params = DiffusionMapHyperParams::new(embedding_size) + .steps(1) + .build(); + + DiffusionMap::project(params, self) + } +} + +pub fn dense_from_fn, ArrayView1) -> A>(dataset: &Array2, fnc: T) -> Array2 { + let n_observations = dataset.len_of(Axis(0)); + let mut similarity = Array2::eye(n_observations); + + for i in 0..n_observations { + for j in 0..n_observations { + let a = dataset.row(i); + let b = dataset.row(j); + + similarity[(i, j)] = fnc(a, b); + } + } + + similarity +} + +pub fn sparse_from_fn, ArrayView1) -> A>(dataset: &Array2, k: usize, fnc: T) -> CsMat { + let mut data = sparse::adjacency_matrix(dataset, k); + + for (i, mut vec) in data.outer_iterator_mut().enumerate() { + for (j, val) in vec.iter_mut() { + let a = dataset.row(i); + 
let b = dataset.row(j); + + *val = fnc(a, b); + } + } + + data +} + +#[cfg(test)] +mod tests { + use super::Kernel; + use sprs::CsMatBase; + use ndarray::{Array1, Array2}; + use ndarray_rand::{rand_distr::Uniform, RandomExt}; + + #[test] + fn test_sprs() { + let a: Array2 = Array2::random((10, 10), Uniform::new(0., 1.)); + let a = CsMatBase::csr_from_dense(a.view(), 1e-5); + + assert_eq!(a.size(), 10); + assert_eq!(a.mul_similarity(&Array2::eye(10).view()), a.to_dense()); + } + + #[test] + fn test_dense() { + let id: Array2 = Array2::eye(10); + + assert_eq!(Kernel::sum(&id), Array1::ones(10)); + + assert_eq!(id.size(), 10); + } +} diff --git a/linfa-reduction/src/kernel/polynomial.rs b/linfa-reduction/src/kernel/polynomial.rs new file mode 100644 index 000000000..0052ef305 --- /dev/null +++ b/linfa-reduction/src/kernel/polynomial.rs @@ -0,0 +1,58 @@ +use num_traits::NumCast; +use ndarray::{Array2, ArrayView1}; +use sprs::CsMat; +use std::iter::Sum; + +use crate::Float; +use crate::kernel::{IntoKernel, dense_from_fn, sparse_from_fn}; + +fn kernel(a: ArrayView1, b: ArrayView1, c: A, d: A) -> A { + (a.dot(&b) + c).powf(d) +} + +pub struct PolynomialKernel { + pub data: Array2 +} + +impl> PolynomialKernel { + pub fn new(dataset: &Array2, c: f32, d: f32) -> Self { + let c = NumCast::from(c).unwrap(); + let d = NumCast::from(d).unwrap(); + + PolynomialKernel { + data: dense_from_fn(dataset, |a,b| kernel(a,b,c,d)) + } + } +} + +impl IntoKernel for PolynomialKernel { + type IntoKer = Array2; + + fn into_kernel(self) -> Self::IntoKer { + self.data + } +} + +pub struct SparsePolynomialKernel { + similarity: CsMat +} + +impl SparsePolynomialKernel { + pub fn new(dataset: &Array2, k: usize, c: A, d: A) -> Self { + let c = NumCast::from(c).unwrap(); + let d = NumCast::from(d).unwrap(); + + SparsePolynomialKernel { + similarity: sparse_from_fn(dataset, k, |a,b| kernel(a,b,c,d)) + } + } +} + +impl IntoKernel for SparsePolynomialKernel { + 
type IntoKer = CsMat; + + fn into_kernel(self) -> Self::IntoKer { + self.similarity + } +} + diff --git a/linfa-reduction/src/kernel/sparse.rs b/linfa-reduction/src/kernel/sparse.rs new file mode 100644 index 000000000..8a7dd2bf4 --- /dev/null +++ b/linfa-reduction/src/kernel/sparse.rs @@ -0,0 +1,95 @@ +use sprs::{CsMat, CsMatBase}; +use hnsw::{Searcher, HNSW, Params}; +use space::{Neighbor, MetricPoint}; +use ndarray::{Array2, Axis, ArrayView1}; + +use num_traits::NumCast; + +use crate::Float; + +/// Implementation of euclidean distance for ndarray +struct Euclidean<'a, A>(ArrayView1<'a, A>); + +impl MetricPoint for Euclidean<'_, A> { + fn distance(&self, rhs: &Self) -> u32 { + let val = self.0 + .iter() + .zip(rhs.0.iter()) + .map(|(&a, &b)| (a - b) * (a - b)) + .sum::() + .sqrt(); + + space::f32_metric(val.to_f32().unwrap()) + } +} + +/// Create sparse adjacency matrix from dense dataset +pub fn adjacency_matrix(dataset: &Array2, k: usize) -> CsMat { + let n_points = dataset.len_of(Axis(0)); + + // ensure that the number of neighbours is at least one and less than the total number of + // points + assert!(k < n_points); + assert!(k > 0); + + let params = Params::new() + .ef_construction(k); + + let mut searcher = Searcher::default(); + let mut hnsw: HNSW> = HNSW::new_params(params); + + // insert all rows as data points into HNSW graph + for feature in dataset.genrows().into_iter() { + hnsw.insert(Euclidean(feature), &mut searcher); + } + + // allocate buffer for k neighbours (plus the points itself) + let mut neighbours = vec![Neighbor::invalid(); k + 1]; + + // allocate buffer to initialize the sparse matrix later on + // * data: we have exact #points * k positive entries + // * indptr: has structure [0,k,2k,...,#points*k] + // * indices: filled with the nearest indices + let mut data = Vec::with_capacity(n_points * (k+1)); + let mut indptr = Vec::with_capacity(n_points + 1); + //let indptr = (0..n_points+1).map(|x| x * (k+1)).collect::>(); + let mut indices = 
Vec::with_capacity(n_points * (k+1)); + indptr.push(0); + + // find neighbours for each data point + let mut added = 0; + for (m, feature) in dataset.genrows().into_iter().enumerate() { + hnsw.nearest(&Euclidean(feature), 3 * k, &mut searcher, &mut neighbours); + + //dbg!(&neighbours); + + // sort by indices + neighbours.sort_unstable(); + + indices.push(m); + data.push(NumCast::from(1.0).unwrap()); + added += 1; + + // push each index into the indices array + for n in &neighbours { + if m != n.index { + indices.push(n.index); + data.push(NumCast::from(1.0).unwrap()); + added += 1; + } + } + + indptr.push(added); + } + + + // create CSR matrix from data, indptr and indices + let mat = CsMatBase::new((n_points, n_points), indptr, indices, data); + let mut mat = &mat + &mat.transpose_view(); + + // ensure that all values are one + let val: A = NumCast::from(1.0).unwrap(); + mat.map_inplace(|_| val); + + mat +} diff --git a/linfa-reduction/src/lib.rs b/linfa-reduction/src/lib.rs new file mode 100644 index 000000000..db5777375 --- /dev/null +++ b/linfa-reduction/src/lib.rs @@ -0,0 +1,26 @@ +#[macro_use] extern crate ndarray; + +pub mod kernel; +pub mod diffusion_map; +pub mod pca; +pub mod utils; + +pub use diffusion_map::{DiffusionMap, DiffusionMapHyperParams}; +pub use utils::to_gaussian_similarity; +pub use pca::PrincipalComponentAnalysis; + +use ndarray::NdFloat; +use ndarray_linalg::Lapack; +use num_traits::FromPrimitive; + +pub trait Float: + NdFloat + + Lapack + + Default + + Clone + + FromPrimitive +{ +} + +impl Float for f32 {} +impl Float for f64 {} diff --git a/linfa-reduction/src/pca/algorithms.rs b/linfa-reduction/src/pca/algorithms.rs new file mode 100644 index 000000000..c58948cb9 --- /dev/null +++ b/linfa-reduction/src/pca/algorithms.rs @@ -0,0 +1,58 @@ +///! Principal Component Analysis +/// +/// Reduce dimensionality with a linear projection using Singular Value Decomposition. The data is +/// centered before applying the SVD. 
This uses TruncatedSvd from ndarray-linalg package. + +use ndarray::{ArrayBase, Array1, Array2, Ix2, Axis, DataMut}; +use ndarray_linalg::{TruncatedSvd, TruncatedOrder}; + +/// Pincipal Component Analysis +pub struct PrincipalComponentAnalysis { + embedding: Array2, + explained_variance: Array1, + mean: Array1 +} + +impl PrincipalComponentAnalysis { + pub fn fit>( + mut dataset: ArrayBase, + embedding_size: usize + ) -> Self { + // calculate mean of data and subtract it + let mean = dataset.mean_axis(Axis(0)).unwrap(); + dataset -= &mean; + + // estimate Singular Value Decomposition + let result = TruncatedSvd::new(dataset.to_owned(), TruncatedOrder::Largest) + .decompose(embedding_size) + .unwrap(); + + // explained variance is the spectral distribution of the eigenvalues + let (_, sigma, v_t) = result.values_vectors(); + let explained_variance = sigma.mapv(|x| x*x / (dataset.len() as f64 - 1.0)); + + PrincipalComponentAnalysis { + embedding: v_t, + explained_variance, + mean + } + } + + /// Given a new data points project with fitted model + pub fn predict>( + &self, + dataset: &ArrayBase + ) -> Array2 { + (dataset - &self.mean).dot(&self.embedding.t()) + } + + /// Return the amount of explained variance per element + pub fn explained_variance(&self) -> Array1 { + self.explained_variance.clone() + } + + /// Return the normalized amount of explained variance per element + pub fn explained_variance_ratio(&self) -> Array1 { + &self.explained_variance / self.explained_variance.sum() + } +} diff --git a/linfa-reduction/src/pca/mod.rs b/linfa-reduction/src/pca/mod.rs new file mode 100644 index 000000000..60025f396 --- /dev/null +++ b/linfa-reduction/src/pca/mod.rs @@ -0,0 +1,3 @@ +mod algorithms; + +pub use algorithms::*; diff --git a/linfa-reduction/src/utils.rs b/linfa-reduction/src/utils.rs new file mode 100644 index 000000000..768f0d0f1 --- /dev/null +++ b/linfa-reduction/src/utils.rs @@ -0,0 +1,83 @@ +use ndarray::{Array2, Ix2, Data, Axis, ArrayBase}; +use 
ndarray_rand::rand::Rng; +use num_traits::float::FloatConst; + +/// Computes a similarity matrix with gaussian kernel and scaling parameter `eps` +/// +/// The generated matrix is a upper triangular matrix with dimension NxN (number of observations) and contains the similarity between all permutations of observations +/// similarity +pub fn to_gaussian_similarity( + observations: &ArrayBase, Ix2>, + eps: f64, +) -> Array2 { + let n_observations = observations.len_of(Axis(0)); + let mut similarity = Array2::eye(n_observations); + + for i in 0..n_observations { + for j in 0..n_observations { + let a = observations.row(i); + let b = observations.row(j); + + let distance = a.iter().zip(b.iter()).map(|(x,y)| (x-y).powf(2.0)) + .sum::(); + + similarity[(i,j)] = (-distance / eps).exp(); + } + } + + similarity +} +/// +/// Generates a three dimension swiss roll, centered at the origin with height `height` and +/// outwards speed `speed` +pub fn generate_swissroll( + height: f64, + speed: f64, + n_points: usize, + rng: &mut impl Rng, +) -> Array2 { + let mut roll: Array2 = Array2::zeros((n_points, 3)); + + for i in 0..n_points { + let z = rng.gen_range(0.0, height); + let phi: f64 = rng.gen_range(0.0, 10.0); + //let offset: f64 = rng.gen_range(-0.5, 0.5); + let offset = 0.0; + + let x = speed * phi * phi.cos() + offset; + let y = speed * phi * phi.sin() + offset; + + roll[(i, 0)] = x; + roll[(i, 1)] = y; + roll[(i, 2)] = z; + } + roll +} + +pub fn generate_convoluted_rings( + rings: &[(f64, f64)], + n_points: usize, + rng: &mut impl Rng +) -> Array2 { + let n_points = (n_points as f32 / rings.len() as f32).ceil() as usize; + let mut array = Array2::zeros((n_points * rings.len(), 3)); + + for (n, (start, end)) in rings.into_iter().enumerate() { + // inner circle + for i in 0..n_points { + let r: f64 = rng.gen_range(start, end); + let phi: f64 = rng.gen_range(0.0, f64::PI() * 2.0); + let theta: f64 = rng.gen_range(0.0, f64::PI() * 2.0); + + let x = theta.sin() * phi.cos() * 
r; + let y = theta.sin() * phi.sin() * r; + let z = theta.cos() * r; + + array[(n * n_points + i, 0)] = x; + array[(n * n_points + i, 1)] = y; + array[(n * n_points + i, 2)] = z; + } + } + + array +}