From 32602b51e18f9119ea634bc664de24ba042f2898 Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Thu, 5 Mar 2020 12:17:47 +0100 Subject: [PATCH 01/24] Add first draft of spectral clustering algorithm --- linfa-clustering/src/spectral/algorithms.rs | 36 +++++++++++++++++++ .../src/spectral/hyperparameters.rs | 15 ++++++++ linfa-clustering/src/spectral/mod.rs | 3 ++ 3 files changed, 54 insertions(+) create mode 100644 linfa-clustering/src/spectral/algorithms.rs create mode 100644 linfa-clustering/src/spectral/hyperparameters.rs create mode 100644 linfa-clustering/src/spectral/mod.rs diff --git a/linfa-clustering/src/spectral/algorithms.rs b/linfa-clustering/src/spectral/algorithms.rs new file mode 100644 index 000000000..055fc4fc3 --- /dev/null +++ b/linfa-clustering/src/spectral/algorithms.rs @@ -0,0 +1,36 @@ +pub struct SpectralClustering { + hyperparameters: HyperParams, +} + +impl SpectralClustering { + pub fn fit( + hyperparameters: HyperParams, + observations: &ArrayBase, Ix2>, + rng: &mut impl Rng + ) -> Self { + // compute spectral embedding with diffusion map + let embedding = compute_diffusion_map(observations); + // calculate centroids of this embedding + let kmeans = KMeans::fit(&embedding, &mut rng); + + SpectralClustering { + hyperparameters, + observations, + embedding, + kmeans + } + } + + pub fn predict(&self, observations: &ArrayBase, Ix2>) -> Array1 { + // choose nearest observations and its embeddings + let indices = choose_nearest(&self.observations, observations); + let embeddings = self.embeddings[indices]; + + self.kmeans.predict(embeddings) + } + + /// Return the hyperparameters used to train this spectral mode instance. 
+ pub fn hyperparameters(&self) -> &HyperParams { + &self.hyperparams + } +} diff --git a/linfa-clustering/src/spectral/hyperparameters.rs b/linfa-clustering/src/spectral/hyperparameters.rs new file mode 100644 index 000000000..fa7b539a8 --- /dev/null +++ b/linfa-clustering/src/spectral/hyperparameters.rs @@ -0,0 +1,15 @@ +pub enum Kernel { + Cutoff(f32), + Gaussian(f32) +} + +pub enum Clustering { + KMeans +} + +pub struct HyperParams { + n_clusters: usize, + steps: usize, + kernel: Kernel, + clustering: Clustering, +} diff --git a/linfa-clustering/src/spectral/mod.rs b/linfa-clustering/src/spectral/mod.rs new file mode 100644 index 000000000..4f6742108 --- /dev/null +++ b/linfa-clustering/src/spectral/mod.rs @@ -0,0 +1,3 @@ +mod algorithm; + +pub use algorithm::*; From bc4913d591a3113c395455dded0b87bc340136fa Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Tue, 10 Mar 2020 18:52:25 +0100 Subject: [PATCH 02/24] Add LOBPCG eigensolver --- linfa-clustering/Cargo.toml | 2 + linfa-clustering/examples/kmeans.rs | 1 + linfa-clustering/src/lib.rs | 6 + linfa-clustering/src/spectral/algorithms.rs | 90 ++++++++--- .../src/spectral/hyperparameters.rs | 70 ++++++++- linfa-clustering/src/spectral/lobpcg.rs | 143 ++++++++++++++++++ linfa-clustering/src/spectral/mod.rs | 7 +- linfa-clustering/src/utils.rs | 28 +++- 8 files changed, 319 insertions(+), 28 deletions(-) create mode 100644 linfa-clustering/src/spectral/lobpcg.rs diff --git a/linfa-clustering/Cargo.toml b/linfa-clustering/Cargo.toml index c9e903bf8..65e416ec0 100644 --- a/linfa-clustering/Cargo.toml +++ b/linfa-clustering/Cargo.toml @@ -16,6 +16,8 @@ categories = ["algorithms", "mathematics", "science"] ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]} ndarray-rand = "0.11" ndarray-stats = "0.3" +ndarray-linalg = { version = "0.12", features = ["openblas"] } +sprs = "0.7" serde = { version = "1", features = ["derive"] } [dev-dependencies] diff --git a/linfa-clustering/examples/kmeans.rs 
b/linfa-clustering/examples/kmeans.rs index a9fb71e9c..dde643b64 100644 --- a/linfa-clustering/examples/kmeans.rs +++ b/linfa-clustering/examples/kmeans.rs @@ -37,3 +37,4 @@ fn main() { ) .expect("Failed to write .npy file"); } + diff --git a/linfa-clustering/src/lib.rs b/linfa-clustering/src/lib.rs index 9f262177f..cc2505571 100644 --- a/linfa-clustering/src/lib.rs +++ b/linfa-clustering/src/lib.rs @@ -18,11 +18,17 @@ //! Implementation choices, algorithmic details and a tutorial can be found [here](struct.KMeans.html). //! //! Check [here](https://github.com/LukeMathWalker/clustering-benchmarks) for extensive benchmarks against `scikit-learn`'s K-means implementation. + +#[macro_use] extern crate ndarray; +#[macro_use] extern crate ndarray_linalg; + mod dbscan; #[allow(clippy::new_ret_no_self)] mod k_means; +mod spectral; mod utils; pub use dbscan::*; pub use k_means::*; +pub use spectral::*; pub use utils::*; diff --git a/linfa-clustering/src/spectral/algorithms.rs b/linfa-clustering/src/spectral/algorithms.rs index 055fc4fc3..aafa2455d 100644 --- a/linfa-clustering/src/spectral/algorithms.rs +++ b/linfa-clustering/src/spectral/algorithms.rs @@ -1,36 +1,90 @@ +use ndarray::{ArrayBase, Data, Array1, Ix2, Axis, DataMut}; +use ndarray_rand::rand::Rng; +use ndarray_linalg::{eigh::Eigh, lapack::UPLO}; + +use crate::k_means::*; +use super::hyperparameters::SpectralClusteringHyperParams; + pub struct SpectralClustering { - hyperparameters: HyperParams, + hyperparameters: SpectralClusteringHyperParams, + indices: Array1 } impl SpectralClustering { - pub fn fit( - hyperparameters: HyperParams, - observations: &ArrayBase, Ix2>, - rng: &mut impl Rng + pub fn fit_predict( + hyperparameters: SpectralClusteringHyperParams, + similarity: ArrayBase, Ix2>, + mut rng: &mut impl Rng ) -> Self { // compute spectral embedding with diffusion map - let embedding = compute_diffusion_map(observations); + let embedding = compute_diffusion_map(similarity, hyperparameters.steps(), 
hyperparameters.embedding_size()); + dbg!(&embedding); + // calculate centroids of this embedding - let kmeans = KMeans::fit(&embedding, &mut rng); + let conf = KMeansHyperParams::new(hyperparameters.n_clusters()) + .build(); + + let kmeans = KMeans::fit(conf, &embedding, &mut rng); + let indices = kmeans.predict(&embedding); SpectralClustering { hyperparameters, - observations, - embedding, - kmeans + indices } } - pub fn predict(&self, observations: &ArrayBase, Ix2>) -> Array1 { - // choose nearest observations and its embeddings - let indices = choose_nearest(&self.observations, observations); - let embeddings = self.embeddings[indices]; + /// Return the hyperparameters used to train this spectral mode instance. + pub fn hyperparameters(&self) -> &SpectralClusteringHyperParams { + &self.hyperparameters + } + + /// Return the indices + pub fn indices(&self) -> Array1 { + self.indices.clone() + } +} + +fn compute_diffusion_map(mut similarity: ArrayBase, Ix2>, steps: usize, embedding_size: usize) -> ArrayBase, Ix2> { + // calculate sum of rows + let d = similarity.sum_axis(Axis(0)) + .mapv(|x| 1.0/x.sqrt()); - self.kmeans.predict(embeddings) + // ensure that matrix is symmetric + for (idx, elm) in similarity.indexed_iter_mut() { + let (a, b) = idx; + *elm *= d[a] * d[b]; } - /// Return the hyperparameters used to train this spectral mode instance. 
- pub fn hyperparameters(&self) -> &HyperParams { - &self.hyperparams + for val in similarity.iter_mut() { + if val.abs() < 1e-2 { + *val = 0.0; + } } + //dbg!(&similarity);*/ + + // calculate eigenvalue decomposition + let (vals, mut vecs) = similarity.eigh(UPLO::Upper).unwrap(); + + let n_irrelevant = vals.iter().filter(|x| (*x-1.0).abs() < 1e-2).count(); + let embedding_size = usize::min(similarity.len_of(Axis(0)) - n_irrelevant, embedding_size); + let (start, end) = (similarity.len_of(Axis(0)) - embedding_size - n_irrelevant, similarity.len_of(Axis(0)) - n_irrelevant); + + let d = d.mapv(|x| 1.0/x); + + for (idx, elm) in vecs.indexed_iter_mut() { + let (row, _) = idx; + *elm *= d[row]; + } + + // crop eigenvectors to wished embedding dimension + + let vals = vals.slice(s![start..end]); + let mut vecs = vecs.slice(s![start..end, ..]).t().into_owned(); + + for (mut vec, val) in vecs.gencolumns_mut().into_iter().zip(vals.iter()) { + vec *= val.powf(steps as f64); + } + + vecs } + diff --git a/linfa-clustering/src/spectral/hyperparameters.rs b/linfa-clustering/src/spectral/hyperparameters.rs index fa7b539a8..20e7920f2 100644 --- a/linfa-clustering/src/spectral/hyperparameters.rs +++ b/linfa-clustering/src/spectral/hyperparameters.rs @@ -1,15 +1,71 @@ -pub enum Kernel { - Cutoff(f32), - Gaussian(f32) -} - pub enum Clustering { KMeans } +pub struct SpectralClusteringHyperParams { + n_clusters: usize, + steps: usize, + embedding_size: usize, + clustering: Clustering, +} -pub struct HyperParams { +pub struct SpectralClusteringHyperParamsBuilder { n_clusters: usize, steps: usize, - kernel: Kernel, + embedding_size: usize, clustering: Clustering, } + +impl SpectralClusteringHyperParamsBuilder { + pub fn steps(mut self, steps: usize) -> Self { + self.steps = steps; + + self + } + + pub fn clustering(mut self, clustering: Clustering) -> Self { + self.clustering = clustering; + + self + } + + pub fn build(self) -> SpectralClusteringHyperParams { + 
SpectralClusteringHyperParams::build( + self.n_clusters, + self.steps, + self.embedding_size, + self.clustering + ) + } +} + +impl SpectralClusteringHyperParams { + pub fn new(n_clusters: usize, embedding_size: usize) -> SpectralClusteringHyperParamsBuilder { + SpectralClusteringHyperParamsBuilder { + steps: 10, + clustering: Clustering::KMeans, + n_clusters, + embedding_size + } + } + + pub fn steps(&self) -> usize { + self.steps + } + + pub fn n_clusters(&self) -> usize { + self.n_clusters + } + + pub fn embedding_size(&self) -> usize { + self.embedding_size + } + + pub fn build(n_clusters: usize, steps: usize, embedding_size: usize, clustering: Clustering) -> Self { + SpectralClusteringHyperParams { + steps, + n_clusters, + embedding_size, + clustering + } + } +} diff --git a/linfa-clustering/src/spectral/lobpcg.rs b/linfa-clustering/src/spectral/lobpcg.rs new file mode 100644 index 000000000..7f947dbf6 --- /dev/null +++ b/linfa-clustering/src/spectral/lobpcg.rs @@ -0,0 +1,143 @@ +use ndarray::prelude::*; +use ndarray::OwnedRepr; +use ndarray_linalg::cholesky::*; +use ndarray_linalg::eigh::*; +use ndarray_linalg::norm::*; +use sprs::CsMat; + +pub enum Order { + Largest, + Smallest +} + +fn sorted_eig(input: ArrayView, size: usize, order: Order) -> (Array1, Array2) { + assert_close_l2!(&input, &input.t(), 1e-12); + let (vals, vecs) = input.eigh(UPLO::Upper).unwrap(); + let n = input.len_of(Axis(0)); + + match order { + Order::Largest => (vals.slice_move(s![n-size..; -1]), vecs.slice_move(s![.., n-size..; -1])), + Order::Smallest => (vals.slice_move(s![..size]), vecs.slice_move(s![..size, ..])) + } +} + +fn apply_constraints( + mut V: ArrayViewMut, + fact_YY: CholeskyFactorized>, + Y: Array2 +) { + let gram_YV = Y.t().dot(&V); + + let U = gram_YV.genrows().into_iter() + .map(|x| fact_YY.solvec(&x).unwrap().to_vec()) + .flatten() + .collect::>(); + + let U = Array2::from_shape_vec((5, 5), U).unwrap(); + + V -= &(Y.dot(&U)); +} + +fn orthonormalize( + V: Array2 +) 
-> (Array2, Array2) { + let gram_VV = V.t().dot(&V); + let gram_VV_fac = gram_VV.factorizec(UPLO::Upper).unwrap(); + let inv_gram_VV = gram_VV_fac.invc().unwrap(); + + let V = V.dot(&inv_gram_VV); + (V, inv_gram_VV) +} + +pub fn lobpcg( + A: CsMat, + mut X: Array2, + M: Option>, + Y: Option>, + tol: f32, maxiter: usize, + order: Order +) -> (Array1, Array2) { + // the target matrix should be symmetric + assert!(sprs::is_symmetric(&A)); + assert_eq!(A.cols(), A.rows()); + + // the initital approximation should be maximal square + // n is the dimensionality of the problem + let (n, sizeX) = (X.rows(), X.cols()); + assert!(sizeX <= n); + assert_eq!(n, A.cols()); + + let sizeY = match Y { + Some(ref x) => x.cols(), + _ => 0 + }; + + if (n - sizeY) < 5 * sizeX { + panic!("Please use a different approach, the LOBPCG method only supports the calculation of a couple of eigenvectors!"); + } + + let maxiter = usize::min(n, maxiter); + + if let Some(Y) = Y { + let fact_YY = (&Y.t() * &Y).factorizec(UPLO::Upper).unwrap(); + apply_constraints(X.view_mut(), fact_YY, Y); + } + + let (X, _) = orthonormalize(X); + let AX = &A * &X; + let gram_XAX = &X.t() * &AX; + + let (lambda, eig_block) = sorted_eig(gram_XAX.view(), sizeX, Order::Largest); + + let X = &X * &eig_block; + let AX = &AX * &eig_block; + + let mut residual_norms = Vec::new(); + + for i in 0..maxiter { + let R = &AX - &(&X * &lambda); + let tmp = R.genrows().into_iter().map(|x| x.norm()).collect::>(); + residual_norms.push(tmp); + + } + + + (Array1::zeros(10), Array2::eye(10)) +} + +mod tests { + use super::sorted_eig; + use super::orthonormalize; + use super::Order; + use ndarray::prelude::*; + use ndarray_rand::RandomExt; + use ndarray_rand::rand_distr::Uniform; + + #[test] + fn test_sorted_eigen() { + let matrix = Array2::random((10, 10), Uniform::new(0., 10.)); + let matrix = matrix.t().dot(&matrix); + + // return all eigenvectors with largest first + let (vals, vecs) = sorted_eig(matrix.view(), 10, 
Order::Largest); + + // calculate V * A * V' and compare to original matrix + let diag = Array2::from_diag(&vals); + let rec = (vecs.dot(&diag)).dot(&vecs.t()); + + assert_close_l2!(&matrix, &rec, 1e-5); + } + + #[test] + fn test_orthonormalize() { + let matrix = Array2::random((10, 10), Uniform::new(0., 10.)); + let matrix = matrix.t().dot(&matrix); + dbg!(&matrix); + + let (normalized, _) = orthonormalize(matrix.clone()); + + assert_close_l2!(&normalized.dot(&normalized.t()), &Array2::eye(10), 1e-5); + + } + +} diff --git a/linfa-clustering/src/spectral/mod.rs b/linfa-clustering/src/spectral/mod.rs index 4f6742108..3d48ff32a 100644 --- a/linfa-clustering/src/spectral/mod.rs +++ b/linfa-clustering/src/spectral/mod.rs @@ -1,3 +1,6 @@ -mod algorithm; +mod algorithms; +mod hyperparameters; +mod lobpcg; -pub use algorithm::*; +pub use algorithms::*; +pub use hyperparameters::*; diff --git a/linfa-clustering/src/utils.rs b/linfa-clustering/src/utils.rs index 4ca7f980a..b25ca685e 100644 --- a/linfa-clustering/src/utils.rs +++ b/linfa-clustering/src/utils.rs @@ -1,4 +1,4 @@ -use ndarray::{s, Array, Array2, ArrayBase, Data, Ix1, Ix2}; +use ndarray::{s, Array, Array2, ArrayBase, Data, Ix1, Ix2, Axis}; use ndarray_rand::rand::Rng; use ndarray_rand::rand_distr::StandardNormal; use ndarray_rand::RandomExt; @@ -43,3 +43,29 @@ pub fn generate_blob( let origin_blob: Array2 = Array::random_using(shape, StandardNormal, rng); origin_blob + blob_centroid } + +/// Computes a similarity matrix with gaussian kernel and scaling parameter `eps` +/// +/// The generated matrix is a upper triangular matrix with dimension NxN (number of observations) and contains the similarity between all permutations of observations +/// similarity +pub fn to_gaussian_similarity( + observations: &ArrayBase, Ix2>, + eps: f64, +) -> Array2 { + let n_observations = observations.len_of(Axis(0)); + let mut similarity = Array2::eye(n_observations); + + for i in 0..n_observations { + for j in 0..n_observations { 
+ let a = observations.row(i); + let b = observations.row(j); + + let distance = a.iter().zip(b.iter()).map(|(x,y)| (x-y).powf(2.0)) + .sum::(); + + similarity[(i,j)] = (-distance / eps).exp(); + } + } + + similarity +} From 708736216563f0067b662e65598c060158d0e15f Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Wed, 11 Mar 2020 15:04:21 +0100 Subject: [PATCH 03/24] Add more code to LOBPCG --- linfa-clustering/src/spectral/lobpcg.rs | 131 ++++++++++++++++++++---- 1 file changed, 112 insertions(+), 19 deletions(-) diff --git a/linfa-clustering/src/spectral/lobpcg.rs b/linfa-clustering/src/spectral/lobpcg.rs index 7f947dbf6..143e8ecee 100644 --- a/linfa-clustering/src/spectral/lobpcg.rs +++ b/linfa-clustering/src/spectral/lobpcg.rs @@ -1,8 +1,11 @@ use ndarray::prelude::*; use ndarray::OwnedRepr; use ndarray_linalg::cholesky::*; +use ndarray_linalg::triangular::*; +use ndarray_linalg::qr::*; use ndarray_linalg::eigh::*; use ndarray_linalg::norm::*; +use crate::ndarray::linalg::Dot; use sprs::CsMat; pub enum Order { @@ -21,10 +24,26 @@ fn sorted_eig(input: ArrayView, size: usize, order: Order) -> (Array1< } } +fn ndarray_mask(matrix: ArrayView, mask: &[bool]) -> Array2 { + let (rows, cols) = (matrix.rows(), matrix.cols()); + + assert!(mask.len() == cols); + + let n_positive = mask.iter().filter(|x| **x).count(); + + let matrix = matrix.gencolumns().into_iter().zip(mask.iter()) + .filter(|(_,x)| **x) + .map(|(x,_)| x.to_vec()) + .flatten() + .collect::>(); + + Array2::from_shape_vec((n_positive, rows), matrix).unwrap().reversed_axes() +} + fn apply_constraints( mut V: ArrayViewMut, - fact_YY: CholeskyFactorized>, - Y: Array2 + fact_YY: &CholeskyFactorized>, + Y: ArrayView ) { let gram_YV = Y.t().dot(&V); @@ -43,10 +62,14 @@ fn orthonormalize( ) -> (Array2, Array2) { let gram_VV = V.t().dot(&V); let gram_VV_fac = gram_VV.factorizec(UPLO::Upper).unwrap(); - let inv_gram_VV = gram_VV_fac.invc().unwrap(); + let gram_VV_fac = gram_VV_fac.into_lower(); + + 
assert_close_l2!(&gram_VV, &gram_VV_fac.dot(&gram_VV_fac.t()), 1e-5); + + let V_t = V.reversed_axes(); + let U = gram_VV_fac.solve_triangular(UPLO::Lower, Diag::NonUnit, &V_t).unwrap(); - let V = V.dot(&inv_gram_VV); - (V, inv_gram_VV) + (U, gram_VV_fac) } pub fn lobpcg( @@ -78,27 +101,83 @@ pub fn lobpcg( let maxiter = usize::min(n, maxiter); - if let Some(Y) = Y { - let fact_YY = (&Y.t() * &Y).factorizec(UPLO::Upper).unwrap(); - apply_constraints(X.view_mut(), fact_YY, Y); + let mut fact_YY = None; + if let Some(ref Y) = &Y { + let fact_YY_tmp = Y.t().dot(Y).factorizec(UPLO::Upper).unwrap(); + apply_constraints(X.view_mut(), &fact_YY_tmp, Y.view()); + fact_YY = Some(fact_YY_tmp); } + // orthonormalize the initial guess and calculate matrices AX and XAX let (X, _) = orthonormalize(X); - let AX = &A * &X; - let gram_XAX = &X.t() * &AX; + let AX = A.dot(&X); + let gram_XAX = X.t().dot(&AX); - let (lambda, eig_block) = sorted_eig(gram_XAX.view(), sizeX, Order::Largest); + // perform eigenvalue decomposition on XAX + let (lambda, eig_block) = sorted_eig(gram_XAX.view(), sizeX, order); - let X = &X * &eig_block; - let AX = &AX * &eig_block; + // rotate X and AX with eigenvectors + let X = X.dot(&eig_block); + let AX = AX.dot(&eig_block); + let mut activemask = vec![true; sizeX]; let mut residual_norms = Vec::new(); + let mut previous_block_size = sizeX; + + let mut ident: Array2 = Array2::eye(sizeX); + let ident0: Array2 = Array2::eye(sizeX); for i in 0..maxiter { - let R = &AX - &(&X * &lambda); + // calculate residual + let R = &AX - &X.dot(&lambda); + + // calculate L2 norm for every eigenvector let tmp = R.genrows().into_iter().map(|x| x.norm()).collect::>(); + activemask = tmp.iter().zip(activemask.iter()).map(|(x, a)| *x > tol && *a).collect(); residual_norms.push(tmp); + let current_block_size = activemask.iter().filter(|x| **x).count(); + if current_block_size != previous_block_size { + previous_block_size = current_block_size; + ident = 
Array2::eye(current_block_size); + } + + if current_block_size == 0 { + break; + } + + let mut active_block_R = ndarray_mask(R.view(), &activemask); + if let Some(ref M) = M { + active_block_R = M.dot(&active_block_R); + } + if let (Some(ref Y), Some(ref YY)) = (&Y, &fact_YY) { + apply_constraints(active_block_R.view_mut(), YY, Y.view()); + } + + let (R,_) = orthonormalize(R); + let AR = A.dot(&R); + + // perform the Rayleigh Ritz procedure + // compute symmetric gram matrices + let xaw = X.t().dot(&AR); + let waw = R.t().dot(&AR); + let xbw = X.t().dot(&R); + + let (gramA, gramB) = if i > 0 { + (CsMat::eye(5), CsMat::eye(5)) + } else { + ( + /*sprs::bmat(&[[Some(CsMat::diag(&lambda)), Some(xaw)], + [Some(xaw.t()), Some(waw)]]), + sprs::bmat(&[[Some(ident0), Some(xbw)], + [Some(xbw.t()), Some(ident)]])*/ + CsMat::eye(5), + sprs::bmat(&[[Some(xbw.view())]]) + ) + }; + + //let active_block_R = R.slice(s![:, + } @@ -108,8 +187,10 @@ pub fn lobpcg( mod tests { use super::sorted_eig; use super::orthonormalize; + use super::ndarray_mask; use super::Order; use ndarray::prelude::*; + use ndarray_linalg::qr::*; use ndarray_rand::RandomExt; use ndarray_rand::rand_distr::Uniform; @@ -128,15 +209,27 @@ mod tests { assert_close_l2!(&matrix, &rec, 1e-5); } + #[test] + fn test_masking() { + let matrix = Array2::random((10, 5), Uniform::new(0., 10.)); + let masked_matrix = ndarray_mask(matrix.view(), &[true, true, false, true, false]); + assert_close_l2!(&masked_matrix.slice(s![.., 2]), &matrix.slice(s![.., 3]), 1e-12); + } + #[test] fn test_orthonormalize() { - let matrix = Array2::random((10, 10), Uniform::new(0., 10.)); - let matrix = matrix.t().dot(&matrix); - dbg!(&matrix); + let matrix = Array2::random((10, 10), Uniform::new(-10., 10.)); + + let (n, l) = orthonormalize(matrix.clone()); + + // check for orthogonality + let identity = n.dot(&n.t()); + assert_close_l2!(&identity, &Array2::eye(10), 1e-4); - let (normalized, _) = orthonormalize(matrix.clone()); + // compare 
returned factorization with QR decomposition + let (q, mut r) = matrix.qr().unwrap(); + assert_close_l2!(&r.mapv(|x| x.abs()) , &l.t().mapv(|x| x.abs()), 1e-5); - assert_close_l2!(&normalized.dot(&normalized.t()), &Array2::eye(10), 1e-5); } From 2da63b4585116250f374e407bb35a97b06ead61e Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Fri, 13 Mar 2020 11:09:59 +0100 Subject: [PATCH 04/24] Complete port of LOBPCG --- linfa-clustering/Cargo.toml | 3 +- linfa-clustering/src/spectral/lobpcg.rs | 109 ++++++++++++++++++------ 2 files changed, 87 insertions(+), 25 deletions(-) diff --git a/linfa-clustering/Cargo.toml b/linfa-clustering/Cargo.toml index 65e416ec0..8aa375696 100644 --- a/linfa-clustering/Cargo.toml +++ b/linfa-clustering/Cargo.toml @@ -16,7 +16,8 @@ categories = ["algorithms", "mathematics", "science"] ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]} ndarray-rand = "0.11" ndarray-stats = "0.3" -ndarray-linalg = { version = "0.12", features = ["openblas"] } +#ndarray-linalg = { version = "0.12", features = ["openblas"] } +ndarray-linalg = { path = "../../ndarray-linalg", features = ["openblas"] } sprs = "0.7" serde = { version = "1", features = ["derive"] } diff --git a/linfa-clustering/src/spectral/lobpcg.rs b/linfa-clustering/src/spectral/lobpcg.rs index 143e8ecee..4176ba39f 100644 --- a/linfa-clustering/src/spectral/lobpcg.rs +++ b/linfa-clustering/src/spectral/lobpcg.rs @@ -13,9 +13,13 @@ pub enum Order { Smallest } -fn sorted_eig(input: ArrayView, size: usize, order: Order) -> (Array1, Array2) { +fn sorted_eig(input: ArrayView, stiff: Option>, size: usize, order: &Order) -> (Array1, Array2) { assert_close_l2!(&input, &input.t(), 1e-12); - let (vals, vecs) = input.eigh(UPLO::Upper).unwrap(); + let (vals, vecs) = match stiff { + Some(x) => (input, x).eigh(UPLO::Upper).map(|x| (x.0, (x.1).0)).unwrap(), + _ => input.eigh(UPLO::Upper).unwrap() + }; + let n = input.len_of(Axis(0)); match order { @@ -99,7 +103,7 @@ pub fn lobpcg( 
panic!("Please use a different approach, the LOBPCG method only supports the calculation of a couple of eigenvectors!"); } - let maxiter = usize::min(n, maxiter); + let mut iter = usize::min(n, maxiter); let mut fact_YY = None; if let Some(ref Y) = &Y { @@ -114,11 +118,11 @@ pub fn lobpcg( let gram_XAX = X.t().dot(&AX); // perform eigenvalue decomposition on XAX - let (lambda, eig_block) = sorted_eig(gram_XAX.view(), sizeX, order); + let (mut lambda, mut eig_block) = sorted_eig(gram_XAX.view(), None, sizeX, &order); // rotate X and AX with eigenvectors - let X = X.dot(&eig_block); - let AX = AX.dot(&eig_block); + let mut X = X.dot(&eig_block); + let mut AX = AX.dot(&eig_block); let mut activemask = vec![true; sizeX]; let mut residual_norms = Vec::new(); @@ -127,14 +131,16 @@ pub fn lobpcg( let mut ident: Array2 = Array2::eye(sizeX); let ident0: Array2 = Array2::eye(sizeX); - for i in 0..maxiter { + let mut ap: Option<(Array2, Array2)> = None; + + let final_norm = loop { // calculate residual let R = &AX - &X.dot(&lambda); // calculate L2 norm for every eigenvector let tmp = R.genrows().into_iter().map(|x| x.norm()).collect::>(); activemask = tmp.iter().zip(activemask.iter()).map(|(x, a)| *x > tol && *a).collect(); - residual_norms.push(tmp); + residual_norms.push(tmp.clone()); let current_block_size = activemask.iter().filter(|x| **x).count(); if current_block_size != previous_block_size { @@ -142,8 +148,8 @@ pub fn lobpcg( ident = Array2::eye(current_block_size); } - if current_block_size == 0 { - break; + if current_block_size == 0 || iter == 0 { + break tmp; } let mut active_block_R = ndarray_mask(R.view(), &activemask); @@ -161,27 +167,82 @@ pub fn lobpcg( // compute symmetric gram matrices let xaw = X.t().dot(&AR); let waw = R.t().dot(&AR); - let xbw = X.t().dot(&R); + let xw = X.t().dot(&R); - let (gramA, gramB) = if i > 0 { - (CsMat::eye(5), CsMat::eye(5)) + let (gramA, gramB): (Array2, Array2) = if let Some((ref P, ref AP)) = ap { + let active_P = 
ndarray_mask(P.view(), &activemask); + let active_AP = ndarray_mask(AP.view(), &activemask); + let (active_P, R) = orthonormalize(active_P); + let active_AP = R.solve_triangular(UPLO::Lower, Diag::NonUnit, &active_AP).unwrap(); + + let xap = X.t().dot(&active_AP); + let wap = R.t().dot(&active_AP); + let pap = active_P.t().dot(&active_AP); + let xp = X.t().dot(&active_P); + let wp = R.t().dot(&active_P); + + ( + stack![Axis(0), + stack![Axis(1), Array2::from_diag(&lambda), xaw, xap], + stack![Axis(1), xaw.t(), waw, wap], + stack![Axis(1), xap.t(), wap.t(), pap] + ], + + stack![Axis(0), + stack![Axis(1), ident0, xw, xp], + stack![Axis(1), xw.t(), ident, wp], + stack![Axis(1), xp.t(), wp.t(), ident] + ] + ) } else { ( - /*sprs::bmat(&[[Some(CsMat::diag(&lambda)), Some(xaw)], - [Some(xaw.t()), Some(waw)]]), - sprs::bmat(&[[Some(ident0), Some(xbw)], - [Some(xbw.t()), Some(ident)]])*/ - CsMat::eye(5), - sprs::bmat(&[[Some(xbw.view())]]) + stack![Axis(0), + stack![Axis(1), Array2::from_diag(&lambda), xaw], + stack![Axis(1), xaw.t(), waw] + ], + stack![Axis(0), + stack![Axis(1), ident0, xw], + stack![Axis(1), xw.t(), ident] + ] ) }; - //let active_block_R = R.slice(s![:, + let (new_lambda, eig_vecs) = sorted_eig(gramA.view(), Some(gramB.view()), sizeX, &order); + lambda = new_lambda; - } - + let (pp, app, eig_X) = if let Some((ref P, ref AP)) = ap { + let active_P = ndarray_mask(P.view(), &activemask); + let active_AP = ndarray_mask(AP.view(), &activemask); + + let eig_X = eig_vecs.slice(s![..sizeX, ..]); + let eig_R = eig_vecs.slice(s![sizeX..sizeX+current_block_size, ..]); + let eig_P = eig_vecs.slice(s![sizeX+current_block_size.., ..]); + + let pp = R.dot(&eig_R) + P.dot(&eig_P); + let app = AR.dot(&eig_R) + AP.dot(&eig_P); - (Array1::zeros(10), Array2::eye(10)) + (pp, app, eig_X) + } else { + let eig_X = eig_vecs.slice(s![..sizeX, ..]); + let eig_R = eig_vecs.slice(s![sizeX.., ..]); + + let pp = R.dot(&eig_R); + let app = AR.dot(&eig_R); + + (pp, app, eig_X) + }; + + 
X = X.dot(&eig_X) + &pp; + AX = AX.dot(&eig_X) + &app; + + ap = Some((pp, app)); + + iter -= 1; + }; + + dbg!(&final_norm); + + (lambda, X) } mod tests { @@ -200,7 +261,7 @@ mod tests { let matrix = matrix.t().dot(&matrix); // return all eigenvectors with largest first - let (vals, vecs) = sorted_eig(matrix.view(), 10, Order::Largest); + let (vals, vecs) = sorted_eig(matrix.view(), None, 10, &Order::Largest); // calculate V * A * V' and compare to original matrix let diag = Array2::from_diag(&vals); From df713bdc987bcd5f1ea7ef925e021f5f1a61c4af Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Fri, 13 Mar 2020 13:35:33 +0100 Subject: [PATCH 05/24] Start debugging of LOBPCG --- linfa-clustering/src/spectral/lobpcg.rs | 90 +++++++++++++++++++------ 1 file changed, 69 insertions(+), 21 deletions(-) diff --git a/linfa-clustering/src/spectral/lobpcg.rs b/linfa-clustering/src/spectral/lobpcg.rs index 4176ba39f..e5103ba72 100644 --- a/linfa-clustering/src/spectral/lobpcg.rs +++ b/linfa-clustering/src/spectral/lobpcg.rs @@ -14,7 +14,7 @@ pub enum Order { } fn sorted_eig(input: ArrayView, stiff: Option>, size: usize, order: &Order) -> (Array1, Array2) { - assert_close_l2!(&input, &input.t(), 1e-12); + assert_close_l2!(&input, &input.t(), 1e-4); let (vals, vecs) = match stiff { Some(x) => (input, x).eigh(UPLO::Upper).map(|x| (x.0, (x.1).0)).unwrap(), _ => input.eigh(UPLO::Upper).unwrap() @@ -23,7 +23,7 @@ fn sorted_eig(input: ArrayView, stiff: Option>, si let n = input.len_of(Axis(0)); match order { - Order::Largest => (vals.slice_move(s![n-size..; -1]), vecs.slice_move(s![.., n-size..; -1])), + Order::Largest => (vals.slice_move(s![..size; -1]), vecs.slice_move(s![.., ..size; -1])), Order::Smallest => (vals.slice_move(s![..size]), vecs.slice_move(s![..size, ..])) } } @@ -65,13 +65,14 @@ fn orthonormalize( V: Array2 ) -> (Array2, Array2) { let gram_VV = V.t().dot(&V); - let gram_VV_fac = gram_VV.factorizec(UPLO::Upper).unwrap(); - let gram_VV_fac = 
gram_VV_fac.into_lower(); + let gram_VV_fac = gram_VV.cholesky(UPLO::Lower).unwrap(); assert_close_l2!(&gram_VV, &gram_VV_fac.dot(&gram_VV_fac.t()), 1e-5); let V_t = V.reversed_axes(); - let U = gram_VV_fac.solve_triangular(UPLO::Lower, Diag::NonUnit, &V_t).unwrap(); + let U = gram_VV_fac.solve_triangular(UPLO::Lower, Diag::NonUnit, &V_t) + .unwrap() + .reversed_axes(); (U, gram_VV_fac) } @@ -99,9 +100,9 @@ pub fn lobpcg( _ => 0 }; - if (n - sizeY) < 5 * sizeX { + /*if (n - sizeY) < 5 * sizeX { panic!("Please use a different approach, the LOBPCG method only supports the calculation of a couple of eigenvectors!"); - } + }*/ let mut iter = usize::min(n, maxiter); @@ -116,14 +117,17 @@ pub fn lobpcg( let (X, _) = orthonormalize(X); let AX = A.dot(&X); let gram_XAX = X.t().dot(&AX); + ////dbg!(&X, &AX, &gram_XAX); // perform eigenvalue decomposition on XAX let (mut lambda, mut eig_block) = sorted_eig(gram_XAX.view(), None, sizeX, &order); + //dbg!(&lambda, &eig_block); // rotate X and AX with eigenvectors let mut X = X.dot(&eig_block); let mut AX = AX.dot(&eig_block); + //dbg!(&X, &AX); let mut activemask = vec![true; sizeX]; let mut residual_norms = Vec::new(); let mut previous_block_size = sizeX; @@ -135,12 +139,17 @@ pub fn lobpcg( let final_norm = loop { // calculate residual - let R = &AX - &X.dot(&lambda); + let lambda_tmp = lambda.clone().insert_axis(Axis(0)); + let tmp = &X * &lambda_tmp; + //dbg!(&X, &lambda_tmp, &tmp); + + let R = &AX - &tmp; // calculate L2 norm for every eigenvector - let tmp = R.genrows().into_iter().map(|x| x.norm()).collect::>(); + let tmp = R.gencolumns().into_iter().map(|x| x.norm()).collect::>(); activemask = tmp.iter().zip(activemask.iter()).map(|(x, a)| *x > tol && *a).collect(); residual_norms.push(tmp.clone()); + //dbg!(&residual_norms); let current_block_size = activemask.iter().filter(|x| **x).count(); if current_block_size != previous_block_size { @@ -169,11 +178,18 @@ pub fn lobpcg( let waw = R.t().dot(&AR); let xw = 
X.t().dot(&R); - let (gramA, gramB): (Array2, Array2) = if let Some((ref P, ref AP)) = ap { + let (gramA, gramB, active_P, active_AP) = if let Some((ref P, ref AP)) = ap { let active_P = ndarray_mask(P.view(), &activemask); let active_AP = ndarray_mask(AP.view(), &activemask); - let (active_P, R) = orthonormalize(active_P); - let active_AP = R.solve_triangular(UPLO::Lower, Diag::NonUnit, &active_AP).unwrap(); + //dbg!(&active_P, &active_AP); + let (active_P, P_R) = orthonormalize(active_P); + //dbg!(&active_P, &P_R); + let active_AP = P_R.solve_triangular(UPLO::Lower, Diag::NonUnit, &active_AP.reversed_axes()) + .unwrap() + .reversed_axes(); + + //dbg!(&active_AP); + //dbg!(&R); let xap = X.t().dot(&active_AP); let wap = R.t().dot(&active_AP); @@ -192,7 +208,9 @@ pub fn lobpcg( stack![Axis(1), ident0, xw, xp], stack![Axis(1), xw.t(), ident, wp], stack![Axis(1), xp.t(), wp.t(), ident] - ] + ], + Some(active_P), + Some(active_AP) ) } else { ( @@ -203,23 +221,34 @@ pub fn lobpcg( stack![Axis(0), stack![Axis(1), ident0, xw], stack![Axis(1), xw.t(), ident] - ] + ], + None, + None ) }; + //dbg!(&gramA, &gramB); let (new_lambda, eig_vecs) = sorted_eig(gramA.view(), Some(gramB.view()), sizeX, &order); lambda = new_lambda; - let (pp, app, eig_X) = if let Some((ref P, ref AP)) = ap { - let active_P = ndarray_mask(P.view(), &activemask); - let active_AP = ndarray_mask(AP.view(), &activemask); + //dbg!(&lambda, &eig_vecs); + let (pp, app, eig_X) = if let (Some((ref P, ref AP)), (Some(ref active_P), Some(ref active_AP))) = (ap, (active_P, active_AP)) { let eig_X = eig_vecs.slice(s![..sizeX, ..]); let eig_R = eig_vecs.slice(s![sizeX..sizeX+current_block_size, ..]); let eig_P = eig_vecs.slice(s![sizeX+current_block_size.., ..]); - let pp = R.dot(&eig_R) + P.dot(&eig_P); - let app = AR.dot(&eig_R) + AP.dot(&eig_P); + //dbg!(&eig_X); + //dbg!(&eig_R); + //dbg!(&eig_P); + + //dbg!(&R, &AR, &active_P, &active_AP); + + let pp = R.dot(&eig_R) + active_P.dot(&eig_P); + let app = 
AR.dot(&eig_R) + active_AP.dot(&eig_P); + + //dbg!(&pp); + //dbg!(&app); (pp, app, eig_X) } else { @@ -235,12 +264,17 @@ pub fn lobpcg( X = X.dot(&eig_X) + &pp; AX = AX.dot(&eig_X) + &app; + //dbg!(&X); + //dbg!(&AX); + ap = Some((pp, app)); + //dbg!(&ap); + iter -= 1; }; - dbg!(&final_norm); + dbg!(&residual_norms); (lambda, X) } @@ -250,12 +284,14 @@ mod tests { use super::orthonormalize; use super::ndarray_mask; use super::Order; + use super::lobpcg; use ndarray::prelude::*; use ndarray_linalg::qr::*; use ndarray_rand::RandomExt; use ndarray_rand::rand_distr::Uniform; + use sprs::CsMat; - #[test] + /*#[test] fn test_sorted_eigen() { let matrix = Array2::random((10, 10), Uniform::new(0., 10.)); let matrix = matrix.t().dot(&matrix); @@ -292,6 +328,18 @@ mod tests { assert_close_l2!(&r.mapv(|x| x.abs()) , &l.t().mapv(|x| x.abs()), 1e-5); + }*/ + + #[test] + fn test_eigsolver() { + let X = Array2::random((10, 2), Uniform::new(-1.0, 1.0)); + //let mut X = Array2::ones((10, 1)); + + let diag = arr1(&[1.,2.,3.,4.,5.,6.,7.,8.,9.,10.]); + let A = Array2::from_diag(&diag); + let A = CsMat::csr_from_dense(A.view(), 1e-5); + + dbg!(lobpcg(A, X, None, None, 1e-5, 10, Order::Largest)); } } From e4fede36cf2a58396ee91f89edb60a328b2025d3 Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Fri, 13 Mar 2020 14:57:14 +0100 Subject: [PATCH 06/24] Add tests for LOBPCG --- linfa-clustering/src/spectral/lobpcg.rs | 31 ++++++++++++++----------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/linfa-clustering/src/spectral/lobpcg.rs b/linfa-clustering/src/spectral/lobpcg.rs index e5103ba72..3d0ea7b70 100644 --- a/linfa-clustering/src/spectral/lobpcg.rs +++ b/linfa-clustering/src/spectral/lobpcg.rs @@ -14,7 +14,7 @@ pub enum Order { } fn sorted_eig(input: ArrayView, stiff: Option>, size: usize, order: &Order) -> (Array1, Array2) { - assert_close_l2!(&input, &input.t(), 1e-4); + //assert_close_l2!(&input, &input.t(), 1e-4); let (vals, vecs) = match stiff { Some(x) => (input, 
x).eigh(UPLO::Upper).map(|x| (x.0, (x.1).0)).unwrap(), _ => input.eigh(UPLO::Upper).unwrap() @@ -22,16 +22,17 @@ fn sorted_eig(input: ArrayView, stiff: Option>, si let n = input.len_of(Axis(0)); + match order { - Order::Largest => (vals.slice_move(s![..size; -1]), vecs.slice_move(s![.., ..size; -1])), - Order::Smallest => (vals.slice_move(s![..size]), vecs.slice_move(s![..size, ..])) + Order::Largest => (vals.slice_move(s![n-size..; -1]), vecs.slice_move(s![.., n-size..; -1])), + Order::Smallest => (vals.slice_move(s![..size]), vecs.slice_move(s![.., ..size])) } } fn ndarray_mask(matrix: ArrayView, mask: &[bool]) -> Array2 { let (rows, cols) = (matrix.rows(), matrix.cols()); - assert!(mask.len() == cols); + assert_eq!(mask.len(), cols); let n_positive = mask.iter().filter(|x| **x).count(); @@ -100,9 +101,9 @@ pub fn lobpcg( _ => 0 }; - /*if (n - sizeY) < 5 * sizeX { + if (n - sizeY) < 5 * sizeX { panic!("Please use a different approach, the LOBPCG method only supports the calculation of a couple of eigenvectors!"); - }*/ + } let mut iter = usize::min(n, maxiter); @@ -169,7 +170,7 @@ pub fn lobpcg( apply_constraints(active_block_R.view_mut(), YY, Y.view()); } - let (R,_) = orthonormalize(R); + let (R,_) = orthonormalize(active_block_R); let AR = A.dot(&R); // perform the Rayleigh Ritz procedure @@ -291,7 +292,7 @@ mod tests { use ndarray_rand::rand_distr::Uniform; use sprs::CsMat; - /*#[test] + #[test] fn test_sorted_eigen() { let matrix = Array2::random((10, 10), Uniform::new(0., 10.)); let matrix = matrix.t().dot(&matrix); @@ -328,18 +329,20 @@ mod tests { assert_close_l2!(&r.mapv(|x| x.abs()) , &l.t().mapv(|x| x.abs()), 1e-5); - }*/ + } #[test] fn test_eigsolver() { - let X = Array2::random((10, 2), Uniform::new(-1.0, 1.0)); - //let mut X = Array2::ones((10, 1)); + let X = Array2::random((20, 3), Uniform::new(-1.0, 1.0)); - let diag = arr1(&[1.,2.,3.,4.,5.,6.,7.,8.,9.,10.]); + let diag = arr1(&[1.,2.,3.,4.,5.,6.,7.,8.,9.,10., 11., 12., 13., 14., 15., 16., 17., 
18., 19., 20.]); let A = Array2::from_diag(&diag); let A = CsMat::csr_from_dense(A.view(), 1e-5); - dbg!(lobpcg(A, X, None, None, 1e-5, 10, Order::Largest)); - } + let (vals, _) = lobpcg(A.clone(), X.clone(), None, None, 1e-5, 20, Order::Smallest); + assert_close_l2!(&vals, &arr1(&[1.0, 2.0, 3.0]), 1e-5); + let (vals, _) = lobpcg(A, X, None, None, 1e-5, 20, Order::Largest); + assert_close_l2!(&vals, &arr1(&[20.0, 19.0, 18.0]), 1e-5); + } } From c73572479c8b247956a533673d8dbff717491ad4 Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Wed, 18 Mar 2020 10:27:32 +0100 Subject: [PATCH 07/24] Add linfa-reduction library for dimensionality reduction --- Cargo.toml | 5 +- linfa-clustering/Cargo.toml | 1 + linfa-clustering/src/spectral/algorithms.rs | 90 ----- linfa-clustering/src/spectral/lobpcg.rs | 348 ------------------ linfa-reduction/Cargo.toml | 9 + linfa-reduction/examples/spectral.rs | 46 +++ .../src/diffusion_map/algorithms.rs | 72 ++++ .../src/diffusion_map}/hyperparameters.rs | 0 .../src/diffusion_map}/mod.rs | 1 - linfa-reduction/src/lib.rs | 7 + 10 files changed, 139 insertions(+), 440 deletions(-) delete mode 100644 linfa-clustering/src/spectral/algorithms.rs delete mode 100644 linfa-clustering/src/spectral/lobpcg.rs create mode 100644 linfa-reduction/Cargo.toml create mode 100644 linfa-reduction/examples/spectral.rs create mode 100644 linfa-reduction/src/diffusion_map/algorithms.rs rename {linfa-clustering/src/spectral => linfa-reduction/src/diffusion_map}/hyperparameters.rs (100%) rename {linfa-clustering/src/spectral => linfa-reduction/src/diffusion_map}/mod.rs (88%) create mode 100644 linfa-reduction/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index 5bd52a23b..343cf6f64 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,10 @@ rand_isaac = "0.2.0" ndarray-npy = { version = "0.5", default-features = false } [workspace] -members = ["linfa-clustering"] +members = [ + "linfa-clustering", + "linfa-reduction" +] [profile.release] opt-level = 3 diff 
--git a/linfa-clustering/Cargo.toml b/linfa-clustering/Cargo.toml index 8aa375696..701bef888 100644 --- a/linfa-clustering/Cargo.toml +++ b/linfa-clustering/Cargo.toml @@ -20,6 +20,7 @@ ndarray-stats = "0.3" ndarray-linalg = { path = "../../ndarray-linalg", features = ["openblas"] } sprs = "0.7" serde = { version = "1", features = ["derive"] } +num-traits = "0.1.32" [dev-dependencies] rand_isaac = "0.2.0" diff --git a/linfa-clustering/src/spectral/algorithms.rs b/linfa-clustering/src/spectral/algorithms.rs deleted file mode 100644 index aafa2455d..000000000 --- a/linfa-clustering/src/spectral/algorithms.rs +++ /dev/null @@ -1,90 +0,0 @@ -use ndarray::{ArrayBase, Data, Array1, Ix2, Axis, DataMut}; -use ndarray_rand::rand::Rng; -use ndarray_linalg::{eigh::Eigh, lapack::UPLO}; - -use crate::k_means::*; -use super::hyperparameters::SpectralClusteringHyperParams; - -pub struct SpectralClustering { - hyperparameters: SpectralClusteringHyperParams, - indices: Array1 -} - -impl SpectralClustering { - pub fn fit_predict( - hyperparameters: SpectralClusteringHyperParams, - similarity: ArrayBase, Ix2>, - mut rng: &mut impl Rng - ) -> Self { - // compute spectral embedding with diffusion map - let embedding = compute_diffusion_map(similarity, hyperparameters.steps(), hyperparameters.embedding_size()); - dbg!(&embedding); - - // calculate centroids of this embedding - let conf = KMeansHyperParams::new(hyperparameters.n_clusters()) - .build(); - - let kmeans = KMeans::fit(conf, &embedding, &mut rng); - let indices = kmeans.predict(&embedding); - - SpectralClustering { - hyperparameters, - indices - } - } - - /// Return the hyperparameters used to train this spectral mode instance. 
- pub fn hyperparameters(&self) -> &SpectralClusteringHyperParams { - &self.hyperparameters - } - - /// Return the indices - pub fn indices(&self) -> Array1 { - self.indices.clone() - } -} - -fn compute_diffusion_map(mut similarity: ArrayBase, Ix2>, steps: usize, embedding_size: usize) -> ArrayBase, Ix2> { - // calculate sum of rows - let d = similarity.sum_axis(Axis(0)) - .mapv(|x| 1.0/x.sqrt()); - - // ensure that matrix is symmetric - for (idx, elm) in similarity.indexed_iter_mut() { - let (a, b) = idx; - *elm *= d[a] * d[b]; - } - - for val in similarity.iter_mut() { - if val.abs() < 1e-2 { - *val = 0.0; - } - } - //dbg!(&similarity);*/ - - // calculate eigenvalue decomposition - let (vals, mut vecs) = similarity.eigh(UPLO::Upper).unwrap(); - - let n_irrelevant = vals.iter().filter(|x| (*x-1.0).abs() < 1e-2).count(); - let embedding_size = usize::min(similarity.len_of(Axis(0)) - n_irrelevant, embedding_size); - let (start, end) = (similarity.len_of(Axis(0)) - embedding_size - n_irrelevant, similarity.len_of(Axis(0)) - n_irrelevant); - - let d = d.mapv(|x| 1.0/x); - - for (idx, elm) in vecs.indexed_iter_mut() { - let (row, _) = idx; - *elm *= d[row]; - } - - // crop eigenvectors to wished embedding dimension - - let vals = vals.slice(s![start..end]); - let mut vecs = vecs.slice(s![start..end, ..]).t().into_owned(); - - for (mut vec, val) in vecs.gencolumns_mut().into_iter().zip(vals.iter()) { - vec *= val.powf(steps as f64); - } - - vecs -} - diff --git a/linfa-clustering/src/spectral/lobpcg.rs b/linfa-clustering/src/spectral/lobpcg.rs deleted file mode 100644 index 3d0ea7b70..000000000 --- a/linfa-clustering/src/spectral/lobpcg.rs +++ /dev/null @@ -1,348 +0,0 @@ -use ndarray::prelude::*; -use ndarray::OwnedRepr; -use ndarray_linalg::cholesky::*; -use ndarray_linalg::triangular::*; -use ndarray_linalg::qr::*; -use ndarray_linalg::eigh::*; -use ndarray_linalg::norm::*; -use crate::ndarray::linalg::Dot; -use sprs::CsMat; - -pub enum Order { - Largest, - Smallest 
-} - -fn sorted_eig(input: ArrayView, stiff: Option>, size: usize, order: &Order) -> (Array1, Array2) { - //assert_close_l2!(&input, &input.t(), 1e-4); - let (vals, vecs) = match stiff { - Some(x) => (input, x).eigh(UPLO::Upper).map(|x| (x.0, (x.1).0)).unwrap(), - _ => input.eigh(UPLO::Upper).unwrap() - }; - - let n = input.len_of(Axis(0)); - - - match order { - Order::Largest => (vals.slice_move(s![n-size..; -1]), vecs.slice_move(s![.., n-size..; -1])), - Order::Smallest => (vals.slice_move(s![..size]), vecs.slice_move(s![.., ..size])) - } -} - -fn ndarray_mask(matrix: ArrayView, mask: &[bool]) -> Array2 { - let (rows, cols) = (matrix.rows(), matrix.cols()); - - assert_eq!(mask.len(), cols); - - let n_positive = mask.iter().filter(|x| **x).count(); - - let matrix = matrix.gencolumns().into_iter().zip(mask.iter()) - .filter(|(_,x)| **x) - .map(|(x,_)| x.to_vec()) - .flatten() - .collect::>(); - - Array2::from_shape_vec((n_positive, rows), matrix).unwrap().reversed_axes() -} - -fn apply_constraints( - mut V: ArrayViewMut, - fact_YY: &CholeskyFactorized>, - Y: ArrayView -) { - let gram_YV = Y.t().dot(&V); - - let U = gram_YV.genrows().into_iter() - .map(|x| fact_YY.solvec(&x).unwrap().to_vec()) - .flatten() - .collect::>(); - - let U = Array2::from_shape_vec((5, 5), U).unwrap(); - - V -= &(Y.dot(&U)); -} - -fn orthonormalize( - V: Array2 -) -> (Array2, Array2) { - let gram_VV = V.t().dot(&V); - let gram_VV_fac = gram_VV.cholesky(UPLO::Lower).unwrap(); - - assert_close_l2!(&gram_VV, &gram_VV_fac.dot(&gram_VV_fac.t()), 1e-5); - - let V_t = V.reversed_axes(); - let U = gram_VV_fac.solve_triangular(UPLO::Lower, Diag::NonUnit, &V_t) - .unwrap() - .reversed_axes(); - - (U, gram_VV_fac) -} - -pub fn lobpcg( - A: CsMat, - mut X: Array2, - M: Option>, - Y: Option>, - tol: f32, maxiter: usize, - order: Order -) -> (Array1, Array2) { - // the target matrix should be symmetric - assert!(sprs::is_symmetric(&A)); - assert_eq!(A.cols(), A.rows()); - - // the initital approximation 
should be maximal square - // n is the dimensionality of the problem - let (n, sizeX) = (X.rows(), X.cols()); - assert!(sizeX <= n); - assert_eq!(n, A.cols()); - - let sizeY = match Y { - Some(ref x) => x.cols(), - _ => 0 - }; - - if (n - sizeY) < 5 * sizeX { - panic!("Please use a different approach, the LOBPCG method only supports the calculation of a couple of eigenvectors!"); - } - - let mut iter = usize::min(n, maxiter); - - let mut fact_YY = None; - if let Some(ref Y) = &Y { - let fact_YY_tmp = Y.t().dot(Y).factorizec(UPLO::Upper).unwrap(); - apply_constraints(X.view_mut(), &fact_YY_tmp, Y.view()); - fact_YY = Some(fact_YY_tmp); - } - - // orthonormalize the initial guess and calculate matrices AX and XAX - let (X, _) = orthonormalize(X); - let AX = A.dot(&X); - let gram_XAX = X.t().dot(&AX); - ////dbg!(&X, &AX, &gram_XAX); - - // perform eigenvalue decomposition on XAX - let (mut lambda, mut eig_block) = sorted_eig(gram_XAX.view(), None, sizeX, &order); - //dbg!(&lambda, &eig_block); - - // rotate X and AX with eigenvectors - let mut X = X.dot(&eig_block); - let mut AX = AX.dot(&eig_block); - - //dbg!(&X, &AX); - let mut activemask = vec![true; sizeX]; - let mut residual_norms = Vec::new(); - let mut previous_block_size = sizeX; - - let mut ident: Array2 = Array2::eye(sizeX); - let ident0: Array2 = Array2::eye(sizeX); - - let mut ap: Option<(Array2, Array2)> = None; - - let final_norm = loop { - // calculate residual - let lambda_tmp = lambda.clone().insert_axis(Axis(0)); - let tmp = &X * &lambda_tmp; - //dbg!(&X, &lambda_tmp, &tmp); - - let R = &AX - &tmp; - - // calculate L2 norm for every eigenvector - let tmp = R.gencolumns().into_iter().map(|x| x.norm()).collect::>(); - activemask = tmp.iter().zip(activemask.iter()).map(|(x, a)| *x > tol && *a).collect(); - residual_norms.push(tmp.clone()); - //dbg!(&residual_norms); - - let current_block_size = activemask.iter().filter(|x| **x).count(); - if current_block_size != previous_block_size { - 
previous_block_size = current_block_size; - ident = Array2::eye(current_block_size); - } - - if current_block_size == 0 || iter == 0 { - break tmp; - } - - let mut active_block_R = ndarray_mask(R.view(), &activemask); - if let Some(ref M) = M { - active_block_R = M.dot(&active_block_R); - } - if let (Some(ref Y), Some(ref YY)) = (&Y, &fact_YY) { - apply_constraints(active_block_R.view_mut(), YY, Y.view()); - } - - let (R,_) = orthonormalize(active_block_R); - let AR = A.dot(&R); - - // perform the Rayleigh Ritz procedure - // compute symmetric gram matrices - let xaw = X.t().dot(&AR); - let waw = R.t().dot(&AR); - let xw = X.t().dot(&R); - - let (gramA, gramB, active_P, active_AP) = if let Some((ref P, ref AP)) = ap { - let active_P = ndarray_mask(P.view(), &activemask); - let active_AP = ndarray_mask(AP.view(), &activemask); - //dbg!(&active_P, &active_AP); - let (active_P, P_R) = orthonormalize(active_P); - //dbg!(&active_P, &P_R); - let active_AP = P_R.solve_triangular(UPLO::Lower, Diag::NonUnit, &active_AP.reversed_axes()) - .unwrap() - .reversed_axes(); - - //dbg!(&active_AP); - //dbg!(&R); - - let xap = X.t().dot(&active_AP); - let wap = R.t().dot(&active_AP); - let pap = active_P.t().dot(&active_AP); - let xp = X.t().dot(&active_P); - let wp = R.t().dot(&active_P); - - ( - stack![Axis(0), - stack![Axis(1), Array2::from_diag(&lambda), xaw, xap], - stack![Axis(1), xaw.t(), waw, wap], - stack![Axis(1), xap.t(), wap.t(), pap] - ], - - stack![Axis(0), - stack![Axis(1), ident0, xw, xp], - stack![Axis(1), xw.t(), ident, wp], - stack![Axis(1), xp.t(), wp.t(), ident] - ], - Some(active_P), - Some(active_AP) - ) - } else { - ( - stack![Axis(0), - stack![Axis(1), Array2::from_diag(&lambda), xaw], - stack![Axis(1), xaw.t(), waw] - ], - stack![Axis(0), - stack![Axis(1), ident0, xw], - stack![Axis(1), xw.t(), ident] - ], - None, - None - ) - }; - - //dbg!(&gramA, &gramB); - let (new_lambda, eig_vecs) = sorted_eig(gramA.view(), Some(gramB.view()), sizeX, &order); - lambda 
= new_lambda; - - //dbg!(&lambda, &eig_vecs); - let (pp, app, eig_X) = if let (Some((ref P, ref AP)), (Some(ref active_P), Some(ref active_AP))) = (ap, (active_P, active_AP)) { - - let eig_X = eig_vecs.slice(s![..sizeX, ..]); - let eig_R = eig_vecs.slice(s![sizeX..sizeX+current_block_size, ..]); - let eig_P = eig_vecs.slice(s![sizeX+current_block_size.., ..]); - - //dbg!(&eig_X); - //dbg!(&eig_R); - //dbg!(&eig_P); - - //dbg!(&R, &AR, &active_P, &active_AP); - - let pp = R.dot(&eig_R) + active_P.dot(&eig_P); - let app = AR.dot(&eig_R) + active_AP.dot(&eig_P); - - //dbg!(&pp); - //dbg!(&app); - - (pp, app, eig_X) - } else { - let eig_X = eig_vecs.slice(s![..sizeX, ..]); - let eig_R = eig_vecs.slice(s![sizeX.., ..]); - - let pp = R.dot(&eig_R); - let app = AR.dot(&eig_R); - - (pp, app, eig_X) - }; - - X = X.dot(&eig_X) + &pp; - AX = AX.dot(&eig_X) + &app; - - //dbg!(&X); - //dbg!(&AX); - - ap = Some((pp, app)); - - //dbg!(&ap); - - iter -= 1; - }; - - dbg!(&residual_norms); - - (lambda, X) -} - -mod tests { - use super::sorted_eig; - use super::orthonormalize; - use super::ndarray_mask; - use super::Order; - use super::lobpcg; - use ndarray::prelude::*; - use ndarray_linalg::qr::*; - use ndarray_rand::RandomExt; - use ndarray_rand::rand_distr::Uniform; - use sprs::CsMat; - - #[test] - fn test_sorted_eigen() { - let matrix = Array2::random((10, 10), Uniform::new(0., 10.)); - let matrix = matrix.t().dot(&matrix); - - // return all eigenvectors with largest first - let (vals, vecs) = sorted_eig(matrix.view(), None, 10, &Order::Largest); - - // calculate V * A * V' and compare to original matrix - let diag = Array2::from_diag(&vals); - let rec = (vecs.dot(&diag)).dot(&vecs.t()); - - assert_close_l2!(&matrix, &rec, 1e-5); - } - - #[test] - fn test_masking() { - let matrix = Array2::random((10, 5), Uniform::new(0., 10.)); - let masked_matrix = ndarray_mask(matrix.view(), &[true, true, false, true, false]); - assert_close_l2!(&masked_matrix.slice(s![.., 2]), 
&matrix.slice(s![.., 3]), 1e-12); - } - - #[test] - fn test_orthonormalize() { - let matrix = Array2::random((10, 10), Uniform::new(-10., 10.)); - - let (n, l) = orthonormalize(matrix.clone()); - - // check for orthogonality - let identity = n.dot(&n.t()); - assert_close_l2!(&identity, &Array2::eye(10), 1e-4); - - // compare returned factorization with QR decomposition - let (q, mut r) = matrix.qr().unwrap(); - assert_close_l2!(&r.mapv(|x| x.abs()) , &l.t().mapv(|x| x.abs()), 1e-5); - - - } - - #[test] - fn test_eigsolver() { - let X = Array2::random((20, 3), Uniform::new(-1.0, 1.0)); - - let diag = arr1(&[1.,2.,3.,4.,5.,6.,7.,8.,9.,10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20.]); - let A = Array2::from_diag(&diag); - let A = CsMat::csr_from_dense(A.view(), 1e-5); - - let (vals, _) = lobpcg(A.clone(), X.clone(), None, None, 1e-5, 20, Order::Smallest); - assert_close_l2!(&vals, &arr1(&[1.0, 2.0, 3.0]), 1e-5); - - let (vals, _) = lobpcg(A, X, None, None, 1e-5, 20, Order::Largest); - assert_close_l2!(&vals, &arr1(&[20.0, 19.0, 18.0]), 1e-5); - } -} diff --git a/linfa-reduction/Cargo.toml b/linfa-reduction/Cargo.toml new file mode 100644 index 000000000..7237eaa5d --- /dev/null +++ b/linfa-reduction/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "linfa-reduction" +version = "0.1.0" +authors = ["Lorenz Schmidt "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/linfa-reduction/examples/spectral.rs b/linfa-reduction/examples/spectral.rs new file mode 100644 index 000000000..50b229f9d --- /dev/null +++ b/linfa-reduction/examples/spectral.rs @@ -0,0 +1,46 @@ +use linfa_clustering::{generate_blobs, to_gaussian_similarity, SpectralClustering, SpectralClusteringHyperParams}; +use ndarray::{array, Axis}; +use ndarray_npy::write_npy; +use ndarray_rand::rand::SeedableRng; +use rand_isaac::Isaac64Rng; + +// A routine K-means task: build a synthetic dataset, fit the algorithm 
on it +// and save both training data and predictions to disk. +fn main() { + // Our random number generator, seeded for reproducibility + let mut rng = Isaac64Rng::seed_from_u64(42); + + // For each our expected centroids, generate `n` data points around it (a "blob") + let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],]; + let n = 5; + let dataset = generate_blobs(n, &expected_centroids, &mut rng); + + let embedding = dataset + .to_similarity(Similarity::Gaussian(50.0)) + .reduce_dimensionality(Method::DiffusionMap) + .unwrap(); + + let similarity = to_gaussian_similarity(&dataset, 50.0); + + // Configure our training algorithm + let n_clusters = expected_centroids.len_of(Axis(0)); + let hyperparams = SpectralClusteringHyperParams::new(n_clusters, 4) + .steps(1) + .build(); + + // Infer an optimal set of centroids based on the training data distribution and assign optimal + // indices to clusters + let cluster_memberships = SpectralClustering::fit_predict(hyperparams, similarity, &mut rng); + let cluster_memberships = cluster_memberships.indices(); + + dbg!(&cluster_memberships); + + // Save to disk our dataset (and the cluster label assigned to each observation) + // We use the `npy` format for compatibility with NumPy + write_npy("spectral_dataset.npy", dataset).expect("Failed to write .npy file"); + write_npy( + "spectral_memberships.npy", + cluster_memberships.map(|&x| x as u64), + ) + .expect("Failed to write .npy file"); +} diff --git a/linfa-reduction/src/diffusion_map/algorithms.rs b/linfa-reduction/src/diffusion_map/algorithms.rs new file mode 100644 index 000000000..3422c8e67 --- /dev/null +++ b/linfa-reduction/src/diffusion_map/algorithms.rs @@ -0,0 +1,72 @@ +use ndarray::{ArrayBase, Data, Array1, Ix2, Axis, DataMut}; +use ndarray_rand::rand::Rng; +use ndarray_linalg::{TruncatedEig, TruncatedOrder, lobpcg::EigResult}; +use super::hyperparameters::DiffusionMapHyperParams; + +pub struct DiffusionMap { + hyperparameters: 
DiffusionMapHyperParams, + embedding: Array2 +} + +impl DiffusionMap { + pub fn fit_predict( + hyperparameters: DiffusionMapHyperParams, + similarity: ArrayBase, Ix2>, + mut rng: &mut impl Rng + ) -> Self { + // compute spectral embedding with diffusion map + let embedding = compute_diffusion_map(similarity, hyperparameters.steps(), hyperparameters.embedding_size()); + + DiffusionMap { + hyperparameters, + embedding + } + } + + /// Return the hyperparameters used to train this spectral mode instance. + pub fn hyperparameters(&self) -> &DiffusionMapHyperParams { + &self.hyperparameters + } +} + +fn compute_diffusion_map(mut similarity: ArrayBase, Ix2>, steps: usize, embedding_size: usize) -> ArrayBase, Ix2> { + // calculate sum of rows + let d = similarity.sum_axis(Axis(0)) + .mapv(|x| 1.0/x.sqrt()); + + // ensure that matrix is symmetric + for (idx, elm) in similarity.indexed_iter_mut() { + let (a, b) = idx; + *elm *= d[a] * d[b]; + } + + for val in similarity.iter_mut() { + if val.abs() < 1e-2 { + *val = 0.0; + } + } + + // calculate truncated eigenvalue decomposition + let result = TruncatedEig::new(similarity.to_owned(), TruncatedOrder::Largest) + .decompose(embedding_size); + + let (vals, mut vecs) = match result { + EigResult::Ok(vals, vecs, _) | EigResult::Err(vals, vecs, _, _) => (vals, vecs), + _ => panic!("Eigendecomposition failed!") + }; + + let d = d.mapv(|x| 1.0/x); + + for (idx, elm) in vecs.indexed_iter_mut() { + let (row, _) = idx; + *elm *= d[row]; + } + + for (mut vec, val) in vecs.gencolumns_mut().into_iter().zip(vals.iter()) { + vec *= val.powf(steps as f64); + } + + dbg!(&vals); + vecs +} + diff --git a/linfa-clustering/src/spectral/hyperparameters.rs b/linfa-reduction/src/diffusion_map/hyperparameters.rs similarity index 100% rename from linfa-clustering/src/spectral/hyperparameters.rs rename to linfa-reduction/src/diffusion_map/hyperparameters.rs diff --git a/linfa-clustering/src/spectral/mod.rs b/linfa-reduction/src/diffusion_map/mod.rs 
similarity index 88% rename from linfa-clustering/src/spectral/mod.rs rename to linfa-reduction/src/diffusion_map/mod.rs index 3d48ff32a..2d0728a5d 100644 --- a/linfa-clustering/src/spectral/mod.rs +++ b/linfa-reduction/src/diffusion_map/mod.rs @@ -1,6 +1,5 @@ mod algorithms; mod hyperparameters; -mod lobpcg; pub use algorithms::*; pub use hyperparameters::*; diff --git a/linfa-reduction/src/lib.rs b/linfa-reduction/src/lib.rs new file mode 100644 index 000000000..6afc5a18c --- /dev/null +++ b/linfa-reduction/src/lib.rs @@ -0,0 +1,7 @@ +pub mod diffusion_map; + +pub use diffusion_map::DiffusionMap; + +pub enum Method { + DiffusionMap +} From a6754249c384d3c832431c6c0a68cfda5338ce8d Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Wed, 18 Mar 2020 10:59:16 +0100 Subject: [PATCH 08/24] Repair example for diffusion map dimensionality reduction --- linfa-clustering/src/lib.rs | 6 +-- linfa-clustering/src/utils.rs | 28 +----------- linfa-reduction/Cargo.toml | 23 +++++++++- .../{spectral.rs => diffusion_map.rs} | 29 ++++++------- .../src/diffusion_map/algorithms.rs | 43 +++++++++++++------ .../src/diffusion_map/hyperparameters.rs | 43 +++++-------------- linfa-reduction/src/lib.rs | 6 ++- 7 files changed, 83 insertions(+), 95 deletions(-) rename linfa-reduction/examples/{spectral.rs => diffusion_map.rs} (57%) diff --git a/linfa-clustering/src/lib.rs b/linfa-clustering/src/lib.rs index cc2505571..d0ebc7dd7 100644 --- a/linfa-clustering/src/lib.rs +++ b/linfa-clustering/src/lib.rs @@ -19,16 +19,14 @@ //! //! Check [here](https://github.com/LukeMathWalker/clustering-benchmarks) for extensive benchmarks against `scikit-learn`'s K-means implementation. 
-#[macro_use] extern crate ndarray; -#[macro_use] extern crate ndarray_linalg; +extern crate ndarray; +extern crate ndarray_linalg; mod dbscan; #[allow(clippy::new_ret_no_self)] mod k_means; -mod spectral; mod utils; pub use dbscan::*; pub use k_means::*; -pub use spectral::*; pub use utils::*; diff --git a/linfa-clustering/src/utils.rs b/linfa-clustering/src/utils.rs index b25ca685e..4ca7f980a 100644 --- a/linfa-clustering/src/utils.rs +++ b/linfa-clustering/src/utils.rs @@ -1,4 +1,4 @@ -use ndarray::{s, Array, Array2, ArrayBase, Data, Ix1, Ix2, Axis}; +use ndarray::{s, Array, Array2, ArrayBase, Data, Ix1, Ix2}; use ndarray_rand::rand::Rng; use ndarray_rand::rand_distr::StandardNormal; use ndarray_rand::RandomExt; @@ -43,29 +43,3 @@ pub fn generate_blob( let origin_blob: Array2 = Array::random_using(shape, StandardNormal, rng); origin_blob + blob_centroid } - -/// Computes a similarity matrix with gaussian kernel and scaling parameter `eps` -/// -/// The generated matrix is a upper triangular matrix with dimension NxN (number of observations) and contains the similarity between all permutations of observations -/// similarity -pub fn to_gaussian_similarity( - observations: &ArrayBase, Ix2>, - eps: f64, -) -> Array2 { - let n_observations = observations.len_of(Axis(0)); - let mut similarity = Array2::eye(n_observations); - - for i in 0..n_observations { - for j in 0..n_observations { - let a = observations.row(i); - let b = observations.row(j); - - let distance = a.iter().zip(b.iter()).map(|(x,y)| (x-y).powf(2.0)) - .sum::(); - - similarity[(i,j)] = (-distance / eps).exp(); - } - } - - similarity -} diff --git a/linfa-reduction/Cargo.toml b/linfa-reduction/Cargo.toml index 7237eaa5d..18c693ff7 100644 --- a/linfa-reduction/Cargo.toml +++ b/linfa-reduction/Cargo.toml @@ -2,8 +2,29 @@ name = "linfa-reduction" version = "0.1.0" authors = ["Lorenz Schmidt "] +description = "A collection of dimensionality reduction techniques" edition = "2018" +license = "MIT/Apache-2.0" 
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +repository = "https://github.com/LukeMathWalker/linfa" +readme = "README.md" + +keywords = ["dimensionality reduction", "machine-learning", "linfa", "spectral", "unsupervised"] +categories = ["algorithms", "mathematics", "science"] [dependencies] +ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]} +ndarray-rand = "0.11" +ndarray-stats = "0.3" +ndarray-linalg = { path = "../../ndarray-linalg", features = ["openblas"] } +sprs = "0.7" +serde = { version = "1", features = ["derive"] } +num-traits = "0.1.32" + +[dev-dependencies] +rand_isaac = "0.2.0" +ndarray-npy = { version = "0.5", default-features = false } +criterion = "0.3" +serde_json = "1" +approx = "0.3" +linfa-clustering = { path = "../linfa-clustering" } diff --git a/linfa-reduction/examples/spectral.rs b/linfa-reduction/examples/diffusion_map.rs similarity index 57% rename from linfa-reduction/examples/spectral.rs rename to linfa-reduction/examples/diffusion_map.rs index 50b229f9d..7f101aa7c 100644 --- a/linfa-reduction/examples/spectral.rs +++ b/linfa-reduction/examples/diffusion_map.rs @@ -1,5 +1,6 @@ -use linfa_clustering::{generate_blobs, to_gaussian_similarity, SpectralClustering, SpectralClusteringHyperParams}; -use ndarray::{array, Axis}; +use linfa_clustering::generate_blobs; +use linfa_reduction::{to_gaussian_similarity, DiffusionMap, DiffusionMapHyperParams}; +use ndarray::array; use ndarray_npy::write_npy; use ndarray_rand::rand::SeedableRng; use rand_isaac::Isaac64Rng; @@ -12,35 +13,29 @@ fn main() { // For each our expected centroids, generate `n` data points around it (a "blob") let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],]; - let n = 5; + let n = 10; let dataset = generate_blobs(n, &expected_centroids, &mut rng); - - let embedding = dataset - .to_similarity(Similarity::Gaussian(50.0)) - .reduce_dimensionality(Method::DiffusionMap) - 
.unwrap(); - let similarity = to_gaussian_similarity(&dataset, 50.0); // Configure our training algorithm - let n_clusters = expected_centroids.len_of(Axis(0)); - let hyperparams = SpectralClusteringHyperParams::new(n_clusters, 4) + let hyperparams = DiffusionMapHyperParams::new(6) .steps(1) .build(); // Infer an optimal set of centroids based on the training data distribution and assign optimal // indices to clusters - let cluster_memberships = SpectralClustering::fit_predict(hyperparams, similarity, &mut rng); - let cluster_memberships = cluster_memberships.indices(); + let diffusion_map = DiffusionMap::project(hyperparams, similarity); + dbg!(&diffusion_map.estimate_clusters()); - dbg!(&cluster_memberships); + let embedding = diffusion_map.embedding(); + dbg!(&embedding); // Save to disk our dataset (and the cluster label assigned to each observation) // We use the `npy` format for compatibility with NumPy - write_npy("spectral_dataset.npy", dataset).expect("Failed to write .npy file"); + write_npy("diffusion_map_dataset.npy", dataset).expect("Failed to write .npy file"); write_npy( - "spectral_memberships.npy", - cluster_memberships.map(|&x| x as u64), + "diffusion_map_embedding.npy", + embedding ) .expect("Failed to write .npy file"); } diff --git a/linfa-reduction/src/diffusion_map/algorithms.rs b/linfa-reduction/src/diffusion_map/algorithms.rs index 3422c8e67..aa921b2e7 100644 --- a/linfa-reduction/src/diffusion_map/algorithms.rs +++ b/linfa-reduction/src/diffusion_map/algorithms.rs @@ -1,25 +1,25 @@ -use ndarray::{ArrayBase, Data, Array1, Ix2, Axis, DataMut}; -use ndarray_rand::rand::Rng; +use ndarray::{ArrayBase, Array1, Array2, Ix2, Axis, DataMut}; use ndarray_linalg::{TruncatedEig, TruncatedOrder, lobpcg::EigResult}; use super::hyperparameters::DiffusionMapHyperParams; pub struct DiffusionMap { hyperparameters: DiffusionMapHyperParams, - embedding: Array2 + embedding: Array2, + eigvals: Array1 } impl DiffusionMap { - pub fn fit_predict( + pub fn project( 
hyperparameters: DiffusionMapHyperParams, similarity: ArrayBase, Ix2>, - mut rng: &mut impl Rng ) -> Self { // compute spectral embedding with diffusion map - let embedding = compute_diffusion_map(similarity, hyperparameters.steps(), hyperparameters.embedding_size()); + let (embedding, eigvals) = compute_diffusion_map(similarity, hyperparameters.steps(), hyperparameters.embedding_size()); DiffusionMap { hyperparameters, - embedding + embedding, + eigvals } } @@ -27,9 +27,25 @@ impl DiffusionMap { pub fn hyperparameters(&self) -> &DiffusionMapHyperParams { &self.hyperparameters } + + /// Estimate the number of clusters in this embedding (very crude for now) + pub fn estimate_clusters(&self) -> usize { + let mean = self.eigvals.sum() / self.eigvals.len() as f64; + self.eigvals.iter().filter(|x| *x > &mean).count() + 1 + } + + /// Return a copy of the eigenvalues + pub fn eigvals(&self) -> Array1 { + self.eigvals.clone() + } + + /// Return the embedding + pub fn embedding(self) -> Array2 { + self.embedding + } } -fn compute_diffusion_map(mut similarity: ArrayBase, Ix2>, steps: usize, embedding_size: usize) -> ArrayBase, Ix2> { +fn compute_diffusion_map(mut similarity: ArrayBase, Ix2>, steps: usize, embedding_size: usize) -> (Array2, Array1) { // calculate sum of rows let d = similarity.sum_axis(Axis(0)) .mapv(|x| 1.0/x.sqrt()); @@ -48,13 +64,17 @@ fn compute_diffusion_map(mut similarity: ArrayBase, Ix2 // calculate truncated eigenvalue decomposition let result = TruncatedEig::new(similarity.to_owned(), TruncatedOrder::Largest) - .decompose(embedding_size); + .decompose(embedding_size + 1); - let (vals, mut vecs) = match result { + let (vals, vecs) = match result { EigResult::Ok(vals, vecs, _) | EigResult::Err(vals, vecs, _, _) => (vals, vecs), _ => panic!("Eigendecomposition failed!") }; + // cut away first eigenvalue/eigenvector + let vals = vals.slice_move(s![1..]); + let mut vecs = vecs.slice_move(s![..,1..]); + let d = d.mapv(|x| 1.0/x); for (idx, elm) in 
vecs.indexed_iter_mut() { @@ -66,7 +86,6 @@ fn compute_diffusion_map(mut similarity: ArrayBase, Ix2 vec *= val.powf(steps as f64); } - dbg!(&vals); - vecs + (vecs, vals) } diff --git a/linfa-reduction/src/diffusion_map/hyperparameters.rs b/linfa-reduction/src/diffusion_map/hyperparameters.rs index 20e7920f2..c35dfad30 100644 --- a/linfa-reduction/src/diffusion_map/hyperparameters.rs +++ b/linfa-reduction/src/diffusion_map/hyperparameters.rs @@ -1,49 +1,32 @@ -pub enum Clustering { - KMeans -} -pub struct SpectralClusteringHyperParams { - n_clusters: usize, +pub struct DiffusionMapHyperParams { steps: usize, embedding_size: usize, - clustering: Clustering, } -pub struct SpectralClusteringHyperParamsBuilder { - n_clusters: usize, +pub struct DiffusionMapHyperParamsBuilder { steps: usize, embedding_size: usize, - clustering: Clustering, } -impl SpectralClusteringHyperParamsBuilder { +impl DiffusionMapHyperParamsBuilder { pub fn steps(mut self, steps: usize) -> Self { self.steps = steps; self } - pub fn clustering(mut self, clustering: Clustering) -> Self { - self.clustering = clustering; - - self - } - - pub fn build(self) -> SpectralClusteringHyperParams { - SpectralClusteringHyperParams::build( - self.n_clusters, + pub fn build(self) -> DiffusionMapHyperParams { + DiffusionMapHyperParams::build( self.steps, self.embedding_size, - self.clustering ) } } -impl SpectralClusteringHyperParams { - pub fn new(n_clusters: usize, embedding_size: usize) -> SpectralClusteringHyperParamsBuilder { - SpectralClusteringHyperParamsBuilder { +impl DiffusionMapHyperParams { + pub fn new(embedding_size: usize) -> DiffusionMapHyperParamsBuilder { + DiffusionMapHyperParamsBuilder { steps: 10, - clustering: Clustering::KMeans, - n_clusters, embedding_size } } @@ -52,20 +35,14 @@ impl SpectralClusteringHyperParams { self.steps } - pub fn n_clusters(&self) -> usize { - self.n_clusters - } - pub fn embedding_size(&self) -> usize { self.embedding_size } - pub fn build(n_clusters: usize, 
steps: usize, embedding_size: usize, clustering: Clustering) -> Self { - SpectralClusteringHyperParams { + pub fn build(steps: usize, embedding_size: usize) -> Self { + DiffusionMapHyperParams { steps, - n_clusters, embedding_size, - clustering } } } diff --git a/linfa-reduction/src/lib.rs b/linfa-reduction/src/lib.rs index 6afc5a18c..40942e6d6 100644 --- a/linfa-reduction/src/lib.rs +++ b/linfa-reduction/src/lib.rs @@ -1,6 +1,10 @@ +#[macro_use] extern crate ndarray; + pub mod diffusion_map; +pub mod utils; -pub use diffusion_map::DiffusionMap; +pub use diffusion_map::{DiffusionMap, DiffusionMapHyperParams}; +pub use utils::to_gaussian_similarity; pub enum Method { DiffusionMap From 3c977ead7cd29a73a175a2cef9dc262df4b9948f Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Thu, 19 Mar 2020 16:43:12 +0100 Subject: [PATCH 09/24] Add simple principal component analysis --- linfa-reduction/examples/pca.rs | 34 +++++++++++++++ linfa-reduction/src/lib.rs | 2 + linfa-reduction/src/pca/algorithms.rs | 43 +++++++++++++++++++ linfa-reduction/src/pca/hyperparameters.rs | 48 ++++++++++++++++++++++ linfa-reduction/src/pca/mod.rs | 5 +++ linfa-reduction/src/utils.rs | 27 ++++++++++++ 6 files changed, 159 insertions(+) create mode 100644 linfa-reduction/examples/pca.rs create mode 100644 linfa-reduction/src/pca/algorithms.rs create mode 100644 linfa-reduction/src/pca/hyperparameters.rs create mode 100644 linfa-reduction/src/pca/mod.rs create mode 100644 linfa-reduction/src/utils.rs diff --git a/linfa-reduction/examples/pca.rs b/linfa-reduction/examples/pca.rs new file mode 100644 index 000000000..834ed6cd8 --- /dev/null +++ b/linfa-reduction/examples/pca.rs @@ -0,0 +1,34 @@ +use linfa_clustering::generate_blobs; +use linfa_reduction::PrincipalComponentAnalysis; +use ndarray::array; +use ndarray_npy::write_npy; +use ndarray_rand::rand::SeedableRng; +use rand_isaac::Isaac64Rng; + +// A routine K-means task: build a synthetic dataset, fit the algorithm on it +// and save both 
training data and predictions to disk. +fn main() { + // Our random number generator, seeded for reproducibility + let mut rng = Isaac64Rng::seed_from_u64(42); + + // For each our expected centroids, generate `n` data points around it (a "blob") + let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],]; + let n = 10; + let dataset = generate_blobs(n, &expected_centroids, &mut rng); + + dbg!(&dataset); + + let embedding = PrincipalComponentAnalysis::fit(dataset.clone(), 1) + .predict(&dataset); + + dbg!(&embedding); + + // Save to disk our dataset (and the cluster label assigned to each observation) + // We use the `npy` format for compatibility with NumPy + write_npy("pca_dataset.npy", dataset).expect("Failed to write .npy file"); + write_npy( + "pca_embedding.npy", + embedding + ) + .expect("Failed to write .npy file"); +} diff --git a/linfa-reduction/src/lib.rs b/linfa-reduction/src/lib.rs index 40942e6d6..31f76770d 100644 --- a/linfa-reduction/src/lib.rs +++ b/linfa-reduction/src/lib.rs @@ -1,8 +1,10 @@ #[macro_use] extern crate ndarray; pub mod diffusion_map; +pub mod pca; pub mod utils; +pub use pca::PrincipalComponentAnalysis; pub use diffusion_map::{DiffusionMap, DiffusionMapHyperParams}; pub use utils::to_gaussian_similarity; diff --git a/linfa-reduction/src/pca/algorithms.rs b/linfa-reduction/src/pca/algorithms.rs new file mode 100644 index 000000000..67da32159 --- /dev/null +++ b/linfa-reduction/src/pca/algorithms.rs @@ -0,0 +1,43 @@ +use ndarray::{ArrayBase, Array1, Array2, Ix2, Axis, DataMut}; +use ndarray_linalg::{TruncatedSvd, TruncatedOrder}; + +pub struct PrincipalComponentAnalysis { + embedding: Array2, + explained_variance: Array1, + mean: Array1 +} + +impl PrincipalComponentAnalysis { + pub fn fit>( + mut dataset: ArrayBase, + embedding_size: usize + ) -> Self { + let mean = dataset.mean_axis(Axis(0)).unwrap(); + + dataset -= &mean; + + let result = TruncatedSvd::new(dataset.to_owned(), TruncatedOrder::Largest) + 
.decompose(embedding_size) + .unwrap(); + + let (_, sigma, v_t) = result.values_vectors(); + let explained_variance = sigma.mapv(|x| x*x / dataset.len() as f64); + + PrincipalComponentAnalysis { + embedding: v_t, + explained_variance, + mean + } + } + + pub fn predict>( + &self, + dataset: &ArrayBase + ) -> Array2 { + (dataset - &self.mean).dot(&self.embedding.t()) + } + + pub fn explained_variance(&self) -> Array1 { + self.explained_variance.clone() + } +} diff --git a/linfa-reduction/src/pca/hyperparameters.rs b/linfa-reduction/src/pca/hyperparameters.rs new file mode 100644 index 000000000..c35dfad30 --- /dev/null +++ b/linfa-reduction/src/pca/hyperparameters.rs @@ -0,0 +1,48 @@ +pub struct DiffusionMapHyperParams { + steps: usize, + embedding_size: usize, +} + +pub struct DiffusionMapHyperParamsBuilder { + steps: usize, + embedding_size: usize, +} + +impl DiffusionMapHyperParamsBuilder { + pub fn steps(mut self, steps: usize) -> Self { + self.steps = steps; + + self + } + + pub fn build(self) -> DiffusionMapHyperParams { + DiffusionMapHyperParams::build( + self.steps, + self.embedding_size, + ) + } +} + +impl DiffusionMapHyperParams { + pub fn new(embedding_size: usize) -> DiffusionMapHyperParamsBuilder { + DiffusionMapHyperParamsBuilder { + steps: 10, + embedding_size + } + } + + pub fn steps(&self) -> usize { + self.steps + } + + pub fn embedding_size(&self) -> usize { + self.embedding_size + } + + pub fn build(steps: usize, embedding_size: usize) -> Self { + DiffusionMapHyperParams { + steps, + embedding_size, + } + } +} diff --git a/linfa-reduction/src/pca/mod.rs b/linfa-reduction/src/pca/mod.rs new file mode 100644 index 000000000..2d0728a5d --- /dev/null +++ b/linfa-reduction/src/pca/mod.rs @@ -0,0 +1,5 @@ +mod algorithms; +mod hyperparameters; + +pub use algorithms::*; +pub use hyperparameters::*; diff --git a/linfa-reduction/src/utils.rs b/linfa-reduction/src/utils.rs new file mode 100644 index 000000000..830b195f4 --- /dev/null +++ 
b/linfa-reduction/src/utils.rs @@ -0,0 +1,27 @@ +use ndarray::{Array2, Ix2, Data, Axis, ArrayBase}; + +/// Computes a similarity matrix with gaussian kernel and scaling parameter `eps` +/// +/// The generated matrix is a upper triangular matrix with dimension NxN (number of observations) and contains the similarity between all permutations of observations +/// similarity +pub fn to_gaussian_similarity( + observations: &ArrayBase, Ix2>, + eps: f64, +) -> Array2 { + let n_observations = observations.len_of(Axis(0)); + let mut similarity = Array2::eye(n_observations); + + for i in 0..n_observations { + for j in 0..n_observations { + let a = observations.row(i); + let b = observations.row(j); + + let distance = a.iter().zip(b.iter()).map(|(x,y)| (x-y).powf(2.0)) + .sum::(); + + similarity[(i,j)] = (-distance / eps).exp(); + } + } + + similarity +} From 58e4d81284110eea3d0250761b4617c0a8cb42d2 Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Thu, 19 Mar 2020 16:47:10 +0100 Subject: [PATCH 10/24] Remove hyperparameter file from PCA --- linfa-reduction/src/lib.rs | 3 +- linfa-reduction/src/pca/hyperparameters.rs | 48 ---------------------- linfa-reduction/src/pca/mod.rs | 2 - 3 files changed, 2 insertions(+), 51 deletions(-) delete mode 100644 linfa-reduction/src/pca/hyperparameters.rs diff --git a/linfa-reduction/src/lib.rs b/linfa-reduction/src/lib.rs index 31f76770d..f2ef14eeb 100644 --- a/linfa-reduction/src/lib.rs +++ b/linfa-reduction/src/lib.rs @@ -9,5 +9,6 @@ pub use diffusion_map::{DiffusionMap, DiffusionMapHyperParams}; pub use utils::to_gaussian_similarity; pub enum Method { - DiffusionMap + DiffusionMap, + PrincipalComponentAnalysis } diff --git a/linfa-reduction/src/pca/hyperparameters.rs b/linfa-reduction/src/pca/hyperparameters.rs deleted file mode 100644 index c35dfad30..000000000 --- a/linfa-reduction/src/pca/hyperparameters.rs +++ /dev/null @@ -1,48 +0,0 @@ -pub struct DiffusionMapHyperParams { - steps: usize, - embedding_size: usize, -} - -pub struct 
DiffusionMapHyperParamsBuilder { - steps: usize, - embedding_size: usize, -} - -impl DiffusionMapHyperParamsBuilder { - pub fn steps(mut self, steps: usize) -> Self { - self.steps = steps; - - self - } - - pub fn build(self) -> DiffusionMapHyperParams { - DiffusionMapHyperParams::build( - self.steps, - self.embedding_size, - ) - } -} - -impl DiffusionMapHyperParams { - pub fn new(embedding_size: usize) -> DiffusionMapHyperParamsBuilder { - DiffusionMapHyperParamsBuilder { - steps: 10, - embedding_size - } - } - - pub fn steps(&self) -> usize { - self.steps - } - - pub fn embedding_size(&self) -> usize { - self.embedding_size - } - - pub fn build(steps: usize, embedding_size: usize) -> Self { - DiffusionMapHyperParams { - steps, - embedding_size, - } - } -} diff --git a/linfa-reduction/src/pca/mod.rs b/linfa-reduction/src/pca/mod.rs index 2d0728a5d..60025f396 100644 --- a/linfa-reduction/src/pca/mod.rs +++ b/linfa-reduction/src/pca/mod.rs @@ -1,5 +1,3 @@ mod algorithms; -mod hyperparameters; pub use algorithms::*; -pub use hyperparameters::*; From 3b1241368d4e93fd92cff767d184e20f5c039743 Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Sun, 22 Mar 2020 21:44:33 +0100 Subject: [PATCH 11/24] Add kernel traits to unify dense and sparse matrices --- linfa-reduction/src/kernel/mod.rs | 97 +++++++++++++++++++++++++++++++ linfa-reduction/src/lib.rs | 1 + 2 files changed, 98 insertions(+) create mode 100644 linfa-reduction/src/kernel/mod.rs diff --git a/linfa-reduction/src/kernel/mod.rs b/linfa-reduction/src/kernel/mod.rs new file mode 100644 index 000000000..93294a8f0 --- /dev/null +++ b/linfa-reduction/src/kernel/mod.rs @@ -0,0 +1,97 @@ +use ndarray::prelude::*; +use ndarray::Data; +use ndarray::NdFloat; +use num_traits::{FromPrimitive, Num, NumCast}; +use sprs::CsMat; + +pub enum Axis { + Column, + Row +} + +pub trait SimilarityKernel {} + +pub trait Kernel { + fn apply_gram>(&self, rhs: ArrayBase) -> Array2; + fn mean(&self, axis: Axis) -> Array1; + fn 
n_features(&self) -> usize; +} + +impl, A: NdFloat + 'static + FromPrimitive> Kernel for ArrayBase { + fn apply_gram>(&self, rhs: ArrayBase) -> Array2 { + self.dot(&rhs) + } + + fn mean(&self, axis: Axis) -> Array1 { + match axis { + Axis::Row => self.mean_axis(ndarray::Axis(0)).unwrap(), + Axis::Column => self.mean_axis(ndarray::Axis(1)).unwrap() + } + } + + fn n_features(&self) -> usize { + self.ncols() + } +} + +impl Kernel for CsMat { + fn apply_gram>(&self, rhs: ArrayBase) -> Array2 { + self * &rhs + } + + fn mean(&self, axis: Axis) -> Array1 { + let mut mean: Array1 = match axis { + Axis::Column => Array1::zeros(self.cols()), + Axis::Row => Array1::zeros(self.rows()) + }; + + for (val, (row, col)) in self.iter() { + match axis { + Axis::Column => mean[col] += *val, + Axis::Row => mean[row] += *val + } + } + + let cols: A = NumCast::from(self.cols()).unwrap(); + let rows: A = NumCast::from(self.rows()).unwrap(); + + match axis { + Axis::Column => mean / rows, + Axis::Row => mean / cols + } + } + + fn n_features(&self) -> usize { + self.cols() + } +} + +#[cfg(test)] +mod tests { + use super::{Kernel, Axis}; + use sprs::CsMatBase; + use ndarray::{Array1, Array2}; + use ndarray_rand::{rand_distr::Uniform, RandomExt}; + + #[test] + fn test_sprs() { + let a: Array2 = Array2::random((10, 5), Uniform::new(0., 1.)); + let a = CsMatBase::csr_from_dense(a.view(), 1e-5); + + assert_eq!(a.mean(Axis::Column).shape(), &[5]); + assert_eq!(a.mean(Axis::Row).shape(), &[10]); + assert_eq!(a.n_features(), 5); + + assert_eq!(a.apply_gram(Array2::eye(5)), a.to_dense()); + } + + #[test] + fn test_dense() { + let id: Array2 = Array2::eye(10); + + assert_eq!(Kernel::mean(&id, Axis::Column), Array1::ones(10) * 0.1); + assert_eq!(Kernel::mean(&id, Axis::Row), Array1::ones(10) * 0.1); + + assert_eq!(id.n_features(), 10); + } +} diff --git a/linfa-reduction/src/lib.rs b/linfa-reduction/src/lib.rs index f2ef14eeb..cb5aea654 100644 --- a/linfa-reduction/src/lib.rs +++ 
b/linfa-reduction/src/lib.rs @@ -1,5 +1,6 @@ #[macro_use] extern crate ndarray; +pub mod kernel; pub mod diffusion_map; pub mod pca; pub mod utils; From fe56fec40dbf6b7b189eceac15a225477a8dc387 Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Mon, 23 Mar 2020 17:36:29 +0100 Subject: [PATCH 12/24] Only symmetric kernel functions are allowed --- linfa-reduction/src/kernel/mod.rs | 60 +++++++++++++------------------ 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/linfa-reduction/src/kernel/mod.rs b/linfa-reduction/src/kernel/mod.rs index 93294a8f0..bb84b3df0 100644 --- a/linfa-reduction/src/kernel/mod.rs +++ b/linfa-reduction/src/kernel/mod.rs @@ -4,17 +4,16 @@ use ndarray::NdFloat; use num_traits::{FromPrimitive, Num, NumCast}; use sprs::CsMat; -pub enum Axis { - Column, - Row -} - -pub trait SimilarityKernel {} - +/// Symmetric kernel function, can be sparse or dense +/// +/// Required functions are: +/// * `apply_gram`: performs a matrix multiplication with `rhs` +/// * `sum`: returns the sum of columns or rows +/// * `size`: returns the number of data-points pub trait Kernel { fn apply_gram>(&self, rhs: ArrayBase) -> Array2; - fn mean(&self, axis: Axis) -> Array1; - fn n_features(&self) -> usize; + fn sum(&self) -> Array1; + fn size(&self) -> usize; } impl, A: NdFloat + 'static + FromPrimitive> Kernel for ArrayBase { @@ -22,14 +21,13 @@ impl, A: NdFloat + 'static + FromPrimitive> Kernel for Arra self.dot(&rhs) } - fn mean(&self, axis: Axis) -> Array1 { - match axis { - Axis::Row => self.mean_axis(ndarray::Axis(0)).unwrap(), - Axis::Column => self.mean_axis(ndarray::Axis(1)).unwrap() - } + fn sum(&self) -> Array1 { + self.mean_axis(ndarray::Axis(0)).unwrap() } - fn n_features(&self) -> usize { + fn size(&self) -> usize { + assert_eq!(self.ncols(), self.nrows()); + self.ncols() } } @@ -39,29 +37,19 @@ impl Kernel for CsMat Array1 { - let mut mean: Array1 = match axis { - Axis::Column => Array1::zeros(self.cols()), - Axis::Row => 
Array1::zeros(self.rows()) - }; - - for (val, (row, col)) in self.iter() { - match axis { - Axis::Column => mean[col] += *val, - Axis::Row => mean[row] += *val - } - } + fn sum(&self) -> Array1 { + let mut mean = Array1::zeros(self.cols()); - let cols: A = NumCast::from(self.cols()).unwrap(); - let rows: A = NumCast::from(self.rows()).unwrap(); - - match axis { - Axis::Column => mean / rows, - Axis::Row => mean / cols + for (val, (_, col)) in self.iter() { + mean[col] += *val; } + + mean } - fn n_features(&self) -> usize { + fn size(&self) -> usize { + assert_eq!(self.cols(), self.rows()); + self.cols() } } @@ -73,7 +61,7 @@ mod tests { use ndarray::{Array1, Array2}; use ndarray_rand::{rand_distr::Uniform, RandomExt}; - #[test] + /*#[test] fn test_sprs() { let a: Array2 = Array2::random((10, 5), Uniform::new(0., 1.)); let a = CsMatBase::csr_from_dense(a.view(), 1e-5); @@ -93,5 +81,5 @@ mod tests { assert_eq!(Kernel::mean(&id, Axis::Row), Array1::ones(10) * 0.1); assert_eq!(id.n_features(), 10); - } + }*/ } From 17658340d677323a0d00301df394e54bc52695d3 Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Mon, 23 Mar 2020 19:44:45 +0100 Subject: [PATCH 13/24] Use kernel trait in diffusion maps --- .../src/diffusion_map/algorithms.rs | 32 +++++++++++++------ linfa-reduction/src/kernel/mod.rs | 25 +++++++-------- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/linfa-reduction/src/diffusion_map/algorithms.rs b/linfa-reduction/src/diffusion_map/algorithms.rs index aa921b2e7..ac546422a 100644 --- a/linfa-reduction/src/diffusion_map/algorithms.rs +++ b/linfa-reduction/src/diffusion_map/algorithms.rs @@ -1,5 +1,8 @@ use ndarray::{ArrayBase, Array1, Array2, Ix2, Axis, DataMut}; -use ndarray_linalg::{TruncatedEig, TruncatedOrder, lobpcg::EigResult}; +use ndarray_linalg::{TruncatedEig, TruncatedOrder, lobpcg::LobpcgResult, lobpcg}; +use ndarray_rand::{rand_distr::Uniform, RandomExt}; + +use crate::kernel::Kernel; use 
super::hyperparameters::DiffusionMapHyperParams; pub struct DiffusionMap { @@ -11,10 +14,10 @@ pub struct DiffusionMap { impl DiffusionMap { pub fn project( hyperparameters: DiffusionMapHyperParams, - similarity: ArrayBase, Ix2>, + kernel: impl Kernel ) -> Self { // compute spectral embedding with diffusion map - let (embedding, eigvals) = compute_diffusion_map(similarity, hyperparameters.steps(), hyperparameters.embedding_size()); + let (embedding, eigvals) = compute_diffusion_map(kernel, hyperparameters.steps(), hyperparameters.embedding_size()); DiffusionMap { hyperparameters, @@ -45,13 +48,13 @@ impl DiffusionMap { } } -fn compute_diffusion_map(mut similarity: ArrayBase, Ix2>, steps: usize, embedding_size: usize) -> (Array2, Array1) { +fn compute_diffusion_map(kernel: impl Kernel, steps: usize, embedding_size: usize) -> (Array2, Array1) { // calculate sum of rows - let d = similarity.sum_axis(Axis(0)) + let d = kernel.sum() .mapv(|x| 1.0/x.sqrt()); // ensure that matrix is symmetric - for (idx, elm) in similarity.indexed_iter_mut() { + /*for (idx, elm) in similarity.indexed_iter_mut() { let (a, b) = idx; *elm *= d[a] * d[b]; } @@ -60,17 +63,26 @@ fn compute_diffusion_map(mut similarity: ArrayBase, Ix2 if val.abs() < 1e-2 { *val = 0.0; } - } + }*/ // calculate truncated eigenvalue decomposition - let result = TruncatedEig::new(similarity.to_owned(), TruncatedOrder::Largest) - .decompose(embedding_size + 1); + let x = Array2::random((d.len(), embedding_size + 1), Uniform::new(0.0, 1.0)); + + let result = lobpcg::lobpcg(|y| { + let mut y = y.to_owned(); + y.gencolumns_mut().into_iter().zip(d.iter()).for_each(|(mut c, x)| c *= *x); + let mut y = kernel.apply_gram(y); + y.genrows_mut().into_iter().zip(d.iter()).for_each(|(mut c, x)| c *= *x); + + y + }, x, |_| {}, None, 1e-5, 20, TruncatedOrder::Largest); let (vals, vecs) = match result { - EigResult::Ok(vals, vecs, _) | EigResult::Err(vals, vecs, _, _) => (vals, vecs), + LobpcgResult::Ok(vals, vecs, _) | 
LobpcgResult::Err(vals, vecs, _, _) => (vals, vecs), _ => panic!("Eigendecomposition failed!") }; + dbg!(&vals); // cut away first eigenvalue/eigenvector let vals = vals.slice_move(s![1..]); let mut vecs = vecs.slice_move(s![..,1..]); diff --git a/linfa-reduction/src/kernel/mod.rs b/linfa-reduction/src/kernel/mod.rs index bb84b3df0..84c4131cb 100644 --- a/linfa-reduction/src/kernel/mod.rs +++ b/linfa-reduction/src/kernel/mod.rs @@ -1,7 +1,7 @@ use ndarray::prelude::*; use ndarray::Data; use ndarray::NdFloat; -use num_traits::{FromPrimitive, Num, NumCast}; +use num_traits::{FromPrimitive, Num}; use sprs::CsMat; /// Symmetric kernel function, can be sparse or dense @@ -22,7 +22,7 @@ impl, A: NdFloat + 'static + FromPrimitive> Kernel for Arra } fn sum(&self) -> Array1 { - self.mean_axis(ndarray::Axis(0)).unwrap() + self.sum_axis(ndarray::Axis(0)) } fn size(&self) -> usize { @@ -56,30 +56,27 @@ impl Kernel for CsMat = Array2::random((10, 5), Uniform::new(0., 1.)); + let a: Array2 = Array2::random((10, 10), Uniform::new(0., 1.)); let a = CsMatBase::csr_from_dense(a.view(), 1e-5); - assert_eq!(a.mean(Axis::Column).shape(), &[5]); - assert_eq!(a.mean(Axis::Row).shape(), &[10]); - assert_eq!(a.n_features(), 5); - - assert_eq!(a.apply_gram(Array2::eye(5)), a.to_dense()); + assert_eq!(a.size(), 10); + assert_eq!(a.apply_gram(Array2::eye(10)), a.to_dense()); } #[test] fn test_dense() { let id: Array2 = Array2::eye(10); - assert_eq!(Kernel::mean(&id, Axis::Column), Array1::ones(10) * 0.1); - assert_eq!(Kernel::mean(&id, Axis::Row), Array1::ones(10) * 0.1); + assert_eq!(Kernel::sum(&id), Array1::ones(10)); + assert_eq!(Kernel::sum(&id), Array1::ones(10)); - assert_eq!(id.n_features(), 10); - }*/ + assert_eq!(id.size(), 10); + } } From fda3f6d101989ea745ae3bd397eb97c1232aefc5 Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Mon, 23 Mar 2020 19:51:51 +0100 Subject: [PATCH 14/24] Apply D on rows, not columns --- linfa-reduction/examples/diffusion_map.rs | 4 ++-- 
linfa-reduction/src/diffusion_map/algorithms.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/linfa-reduction/examples/diffusion_map.rs b/linfa-reduction/examples/diffusion_map.rs index 7f101aa7c..33db4173a 100644 --- a/linfa-reduction/examples/diffusion_map.rs +++ b/linfa-reduction/examples/diffusion_map.rs @@ -15,10 +15,10 @@ fn main() { let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],]; let n = 10; let dataset = generate_blobs(n, &expected_centroids, &mut rng); - let similarity = to_gaussian_similarity(&dataset, 50.0); + let similarity = to_gaussian_similarity(&dataset, 20.0); // Configure our training algorithm - let hyperparams = DiffusionMapHyperParams::new(6) + let hyperparams = DiffusionMapHyperParams::new(4) .steps(1) .build(); diff --git a/linfa-reduction/src/diffusion_map/algorithms.rs b/linfa-reduction/src/diffusion_map/algorithms.rs index ac546422a..a2c5305a0 100644 --- a/linfa-reduction/src/diffusion_map/algorithms.rs +++ b/linfa-reduction/src/diffusion_map/algorithms.rs @@ -70,7 +70,7 @@ fn compute_diffusion_map(kernel: impl Kernel, steps: usize, embedding_size: let result = lobpcg::lobpcg(|y| { let mut y = y.to_owned(); - y.gencolumns_mut().into_iter().zip(d.iter()).for_each(|(mut c, x)| c *= *x); + y.genrows_mut().into_iter().zip(d.iter()).for_each(|(mut c, x)| c *= *x); let mut y = kernel.apply_gram(y); y.genrows_mut().into_iter().zip(d.iter()).for_each(|(mut c, x)| c *= *x); @@ -81,8 +81,8 @@ fn compute_diffusion_map(kernel: impl Kernel, steps: usize, embedding_size: LobpcgResult::Ok(vals, vecs, _) | LobpcgResult::Err(vals, vecs, _, _) => (vals, vecs), _ => panic!("Eigendecomposition failed!") }; - dbg!(&vals); + // cut away first eigenvalue/eigenvector let vals = vals.slice_move(s![1..]); let mut vecs = vecs.slice_move(s![..,1..]); From aff9fa2fe2ea8b588a24e57921fcb54feea21e8f Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Tue, 24 Mar 2020 12:43:51 +0100 Subject: [PATCH 15/24] Use full 
eigenvalue decomposition for small problems --- linfa-reduction/examples/diffusion_map.rs | 6 +- .../src/diffusion_map/algorithms.rs | 71 ++++++++++--------- 2 files changed, 41 insertions(+), 36 deletions(-) diff --git a/linfa-reduction/examples/diffusion_map.rs b/linfa-reduction/examples/diffusion_map.rs index 33db4173a..9462df165 100644 --- a/linfa-reduction/examples/diffusion_map.rs +++ b/linfa-reduction/examples/diffusion_map.rs @@ -13,9 +13,11 @@ fn main() { // For each our expected centroids, generate `n` data points around it (a "blob") let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],]; - let n = 10; + let n = 20; let dataset = generate_blobs(n, &expected_centroids, &mut rng); - let similarity = to_gaussian_similarity(&dataset, 20.0); + let similarity = to_gaussian_similarity(&dataset, 40.0); + + let kernel = GaussianKernel::new(dataset, 40); // Configure our training algorithm let hyperparams = DiffusionMapHyperParams::new(4) diff --git a/linfa-reduction/src/diffusion_map/algorithms.rs b/linfa-reduction/src/diffusion_map/algorithms.rs index a2c5305a0..26fddc4ba 100644 --- a/linfa-reduction/src/diffusion_map/algorithms.rs +++ b/linfa-reduction/src/diffusion_map/algorithms.rs @@ -1,5 +1,5 @@ use ndarray::{ArrayBase, Array1, Array2, Ix2, Axis, DataMut}; -use ndarray_linalg::{TruncatedEig, TruncatedOrder, lobpcg::LobpcgResult, lobpcg}; +use ndarray_linalg::{TruncatedEig, TruncatedOrder, lobpcg::LobpcgResult, lobpcg, eigh::EighInto, lapack::UPLO, close_l2}; use ndarray_rand::{rand_distr::Uniform, RandomExt}; use crate::kernel::Kernel; @@ -49,44 +49,47 @@ impl DiffusionMap { } fn compute_diffusion_map(kernel: impl Kernel, steps: usize, embedding_size: usize) -> (Array2, Array1) { + assert!(embedding_size < kernel.size()); + // calculate sum of rows let d = kernel.sum() .mapv(|x| 1.0/x.sqrt()); - // ensure that matrix is symmetric - /*for (idx, elm) in similarity.indexed_iter_mut() { - let (a, b) = idx; - *elm *= d[a] * d[b]; - } - 
- for val in similarity.iter_mut() { - if val.abs() < 1e-2 { - *val = 0.0; - } - }*/ - - // calculate truncated eigenvalue decomposition - let x = Array2::random((d.len(), embedding_size + 1), Uniform::new(0.0, 1.0)); - - let result = lobpcg::lobpcg(|y| { - let mut y = y.to_owned(); - y.genrows_mut().into_iter().zip(d.iter()).for_each(|(mut c, x)| c *= *x); - let mut y = kernel.apply_gram(y); - y.genrows_mut().into_iter().zip(d.iter()).for_each(|(mut c, x)| c *= *x); - - y - }, x, |_| {}, None, 1e-5, 20, TruncatedOrder::Largest); - - let (vals, vecs) = match result { - LobpcgResult::Ok(vals, vecs, _) | LobpcgResult::Err(vals, vecs, _, _) => (vals, vecs), - _ => panic!("Eigendecomposition failed!") + // use full eigenvalue decomposition for small problem sizes + let (vals, mut vecs) = if kernel.size() < 5 * embedding_size + 1 { + let mut matrix = kernel.apply_gram(Array2::from_diag(&d)); + matrix.genrows_mut().into_iter().zip(d.iter()).for_each(|(mut c, x)| c *= *x); + + let (vals, vecs) = matrix.eigh_into(UPLO::Lower).unwrap(); + let (vals, vecs) = (vals.slice_move(s![..; -1]), vecs.slice_move(s![.., ..; -1])); + ( + vals.slice_move(s![1..embedding_size+1]), + vecs.slice_move(s![..,1..embedding_size+1]) + ) + } else { + // calculate truncated eigenvalue decomposition + let x = Array2::random((d.len(), embedding_size + 1), Uniform::new(0.0, 1.0)); + + let result = lobpcg::lobpcg(|y| { + let mut y = y.to_owned(); + y.genrows_mut().into_iter().zip(d.iter()).for_each(|(mut c, x)| c *= *x); + let mut y = kernel.apply_gram(y); + y.genrows_mut().into_iter().zip(d.iter()).for_each(|(mut c, x)| c *= *x); + + y + }, x, |_| {}, None, 1e-5, 20, TruncatedOrder::Largest); + + let (vals, vecs) = match result { + LobpcgResult::Ok(vals, vecs, _) | LobpcgResult::Err(vals, vecs, _, _) => (vals, vecs), + _ => panic!("Eigendecomposition failed!") + }; + + // cut away first eigenvalue/eigenvector + ( + vals.slice_move(s![1..]), + vecs.slice_move(s![..,1..]) + ) }; - dbg!(&vals); - - // 
cut away first eigenvalue/eigenvector - let vals = vals.slice_move(s![1..]); - let mut vecs = vecs.slice_move(s![..,1..]); - let d = d.mapv(|x| 1.0/x); for (idx, elm) in vecs.indexed_iter_mut() { From 917ce646b222a0d95f9e03c0c8312d76407c663b Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Tue, 24 Mar 2020 14:17:09 +0100 Subject: [PATCH 16/24] Add simple dense gaussian kernel --- linfa-reduction/examples/diffusion_map.rs | 8 ++-- .../src/diffusion_map/algorithms.rs | 6 +-- linfa-reduction/src/kernel/gaussian.rs | 37 +++++++++++++++++++ linfa-reduction/src/kernel/mod.rs | 9 +++++ 4 files changed, 53 insertions(+), 7 deletions(-) create mode 100644 linfa-reduction/src/kernel/gaussian.rs diff --git a/linfa-reduction/examples/diffusion_map.rs b/linfa-reduction/examples/diffusion_map.rs index 9462df165..874697c61 100644 --- a/linfa-reduction/examples/diffusion_map.rs +++ b/linfa-reduction/examples/diffusion_map.rs @@ -1,5 +1,5 @@ use linfa_clustering::generate_blobs; -use linfa_reduction::{to_gaussian_similarity, DiffusionMap, DiffusionMapHyperParams}; +use linfa_reduction::{to_gaussian_similarity, DiffusionMap, DiffusionMapHyperParams, kernel::GaussianKernel}; use ndarray::array; use ndarray_npy::write_npy; use ndarray_rand::rand::SeedableRng; @@ -15,9 +15,9 @@ fn main() { let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],]; let n = 20; let dataset = generate_blobs(n, &expected_centroids, &mut rng); - let similarity = to_gaussian_similarity(&dataset, 40.0); + //let similarity = to_gaussian_similarity(&dataset, 40.0); - let kernel = GaussianKernel::new(dataset, 40); + let kernel = GaussianKernel::new(&dataset, 40.0); // Configure our training algorithm let hyperparams = DiffusionMapHyperParams::new(4) @@ -26,7 +26,7 @@ fn main() { // Infer an optimal set of centroids based on the training data distribution and assign optimal // indices to clusters - let diffusion_map = DiffusionMap::project(hyperparams, similarity); + let diffusion_map = 
DiffusionMap::project(hyperparams, kernel); dbg!(&diffusion_map.estimate_clusters()); let embedding = diffusion_map.embedding(); diff --git a/linfa-reduction/src/diffusion_map/algorithms.rs b/linfa-reduction/src/diffusion_map/algorithms.rs index 26fddc4ba..d9cf0a86a 100644 --- a/linfa-reduction/src/diffusion_map/algorithms.rs +++ b/linfa-reduction/src/diffusion_map/algorithms.rs @@ -2,7 +2,7 @@ use ndarray::{ArrayBase, Array1, Array2, Ix2, Axis, DataMut}; use ndarray_linalg::{TruncatedEig, TruncatedOrder, lobpcg::LobpcgResult, lobpcg, eigh::EighInto, lapack::UPLO, close_l2}; use ndarray_rand::{rand_distr::Uniform, RandomExt}; -use crate::kernel::Kernel; +use crate::kernel::{Kernel, IntoKernel}; use super::hyperparameters::DiffusionMapHyperParams; pub struct DiffusionMap { @@ -14,10 +14,10 @@ pub struct DiffusionMap { impl DiffusionMap { pub fn project( hyperparameters: DiffusionMapHyperParams, - kernel: impl Kernel + kernel: impl IntoKernel ) -> Self { // compute spectral embedding with diffusion map - let (embedding, eigvals) = compute_diffusion_map(kernel, hyperparameters.steps(), hyperparameters.embedding_size()); + let (embedding, eigvals) = compute_diffusion_map(kernel.into_kernel(), hyperparameters.steps(), hyperparameters.embedding_size()); DiffusionMap { hyperparameters, diff --git a/linfa-reduction/src/kernel/gaussian.rs b/linfa-reduction/src/kernel/gaussian.rs new file mode 100644 index 000000000..bfc6678ba --- /dev/null +++ b/linfa-reduction/src/kernel/gaussian.rs @@ -0,0 +1,37 @@ +use ndarray::{Array2, Axis}; +use crate::kernel::IntoKernel; + +pub struct GaussianKernel { + data: Array2 +} + +impl IntoKernel for GaussianKernel { + type IntoKer = Array2; + + fn into_kernel(self) -> Self::IntoKer { + self.data + } +} + +impl GaussianKernel { + pub fn new(dataset: &Array2, eps: f64) -> Self { + let n_observations = dataset.len_of(Axis(0)); + let mut similarity = Array2::eye(n_observations); + + for i in 0..n_observations { + for j in 0..n_observations { + 
let a = dataset.row(i); + let b = dataset.row(j); + + let distance = a.iter().zip(b.iter()).map(|(x,y)| (x-y).powf(2.0)) + .sum::(); + + similarity[(i,j)] = (-distance / eps).exp(); + } + } + + GaussianKernel { + data: similarity + } + } +} diff --git a/linfa-reduction/src/kernel/mod.rs b/linfa-reduction/src/kernel/mod.rs index 84c4131cb..f3781b7f0 100644 --- a/linfa-reduction/src/kernel/mod.rs +++ b/linfa-reduction/src/kernel/mod.rs @@ -1,3 +1,6 @@ +pub mod gaussian; +pub use gaussian::GaussianKernel; + use ndarray::prelude::*; use ndarray::Data; use ndarray::NdFloat; @@ -54,6 +57,12 @@ impl Kernel for CsMat { + type IntoKer: Kernel; + + fn into_kernel(self) -> Self::IntoKer; +} + #[cfg(test)] mod tests { use super::Kernel; From 15e5fe2d57379df5952c7908210e8ceec60803ac Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Wed, 1 Apr 2020 11:51:33 +0200 Subject: [PATCH 17/24] Update example --- linfa-reduction/examples/diffusion_map.rs | 13 +++- .../src/diffusion_map/algorithms.rs | 7 ++ linfa-reduction/src/kernel/gaussian.rs | 2 +- linfa-reduction/src/kernel/mod.rs | 74 +++++++++++++++++++ linfa-reduction/src/lib.rs | 7 +- linfa-reduction/src/pca/algorithms.rs | 8 ++ 6 files changed, 105 insertions(+), 6 deletions(-) diff --git a/linfa-reduction/examples/diffusion_map.rs b/linfa-reduction/examples/diffusion_map.rs index 874697c61..260ca7251 100644 --- a/linfa-reduction/examples/diffusion_map.rs +++ b/linfa-reduction/examples/diffusion_map.rs @@ -1,5 +1,6 @@ use linfa_clustering::generate_blobs; -use linfa_reduction::{to_gaussian_similarity, DiffusionMap, DiffusionMapHyperParams, kernel::GaussianKernel}; +use linfa_reduction::{Reduced, DiffusionMap, DiffusionMapHyperParams, Method, kernel::GaussianKernel}; +use linfa_reduction::kernel::{self, KernelWithMethod}; use ndarray::array; use ndarray_npy::write_npy; use ndarray_rand::rand::SeedableRng; @@ -17,17 +18,21 @@ fn main() { let dataset = generate_blobs(n, &expected_centroids, &mut rng); //let similarity = 
to_gaussian_similarity(&dataset, 40.0); - let kernel = GaussianKernel::new(&dataset, 40.0); + let diffusion_map = dataset + .kernel_with(kernel::Method::Gaussian { eps: 40.0 }) + .reduce_fixed(4, Method::DiffusionMap { steps: 10 }); + + //let kernel = GaussianKernel::new(&dataset, 40.0); // Configure our training algorithm - let hyperparams = DiffusionMapHyperParams::new(4) + /*let hyperparams = DiffusionMapHyperParams::new(4) .steps(1) .build(); // Infer an optimal set of centroids based on the training data distribution and assign optimal // indices to clusters let diffusion_map = DiffusionMap::project(hyperparams, kernel); - dbg!(&diffusion_map.estimate_clusters()); + dbg!(&diffusion_map.estimate_clusters());*/ let embedding = diffusion_map.embedding(); dbg!(&embedding); diff --git a/linfa-reduction/src/diffusion_map/algorithms.rs b/linfa-reduction/src/diffusion_map/algorithms.rs index d9cf0a86a..7d8b9c773 100644 --- a/linfa-reduction/src/diffusion_map/algorithms.rs +++ b/linfa-reduction/src/diffusion_map/algorithms.rs @@ -2,6 +2,7 @@ use ndarray::{ArrayBase, Array1, Array2, Ix2, Axis, DataMut}; use ndarray_linalg::{TruncatedEig, TruncatedOrder, lobpcg::LobpcgResult, lobpcg, eigh::EighInto, lapack::UPLO, close_l2}; use ndarray_rand::{rand_distr::Uniform, RandomExt}; +use crate::Reduced; use crate::kernel::{Kernel, IntoKernel}; use super::hyperparameters::DiffusionMapHyperParams; @@ -48,6 +49,12 @@ impl DiffusionMap { } } +impl Reduced for DiffusionMap { + fn embedding(&self) -> Array2 { + self.embedding.clone() + } +} + fn compute_diffusion_map(kernel: impl Kernel, steps: usize, embedding_size: usize) -> (Array2, Array1) { assert!(embedding_size < kernel.size()); diff --git a/linfa-reduction/src/kernel/gaussian.rs b/linfa-reduction/src/kernel/gaussian.rs index bfc6678ba..ad5471872 100644 --- a/linfa-reduction/src/kernel/gaussian.rs +++ b/linfa-reduction/src/kernel/gaussian.rs @@ -2,7 +2,7 @@ use ndarray::{Array2, Axis}; use crate::kernel::IntoKernel; pub struct 
GaussianKernel { - data: Array2 + pub data: Array2 } impl IntoKernel for GaussianKernel { diff --git a/linfa-reduction/src/kernel/mod.rs b/linfa-reduction/src/kernel/mod.rs index f3781b7f0..d8ac463bd 100644 --- a/linfa-reduction/src/kernel/mod.rs +++ b/linfa-reduction/src/kernel/mod.rs @@ -7,6 +7,9 @@ use ndarray::NdFloat; use num_traits::{FromPrimitive, Num}; use sprs::CsMat; +use crate::Reduced; +use crate::{DiffusionMap, DiffusionMapHyperParams, PrincipalComponentAnalysis}; + /// Symmetric kernel function, can be sparse or dense /// /// Required functions are: @@ -57,12 +60,83 @@ impl Kernel for CsMat { type IntoKer: Kernel; fn into_kernel(self) -> Self::IntoKer; } +pub trait KernelWithMethod { + type IntoKer: Kernel; + + fn kernel_with(&self, method: Method) -> Self::IntoKer; +} + +impl KernelWithMethod for Array2 { + type IntoKer = Kernels; + + fn kernel_with(&self, method: Method) -> Self::IntoKer { + match method { + Method::Gaussian { eps } => Kernels::Gaussian(GaussianKernel::new(&self, eps)), + } + } +} + + +pub enum Method { + Gaussian { eps: f64 }, +} + +pub enum Kernels { + Gaussian(GaussianKernel) +} + +impl Kernels { + pub fn reduce(self, embedding_size: usize, method: crate::Method) -> impl Reduced { + match method { + crate::Method::DiffusionMap { steps } => { + let params = DiffusionMapHyperParams::new(embedding_size) + .steps(steps) + .build(); + + DiffusionMap::project(params, self) + }, + crate::Method::PrincipalComponentAnalysis => { + PrincipalComponentAnalysis::fit(self, embedding_size) + } + } + } +} + +impl Kernel for Kernels { + fn apply_gram>(&self, rhs: ArrayBase) -> Array2 { + match self { + Kernels::Gaussian(x) => x.data.apply_gram(rhs), + } + } + + fn sum(&self) -> Array1 { + match self { + Kernels::Gaussian(x) => Kernel::sum(&x.data), + } + } + + fn size(&self) -> usize { + match self { + Kernels::Gaussian(x) => x.data.size(), + } + } +} + +impl IntoKernel for Kernels { + type IntoKer = Kernels; + + fn into_kernel(self) -> 
Self::IntoKer { + self + } +} + #[cfg(test)] mod tests { use super::Kernel; diff --git a/linfa-reduction/src/lib.rs b/linfa-reduction/src/lib.rs index cb5aea654..40aba1384 100644 --- a/linfa-reduction/src/lib.rs +++ b/linfa-reduction/src/lib.rs @@ -5,11 +5,16 @@ pub mod diffusion_map; pub mod pca; pub mod utils; +use ndarray::Array2; pub use pca::PrincipalComponentAnalysis; pub use diffusion_map::{DiffusionMap, DiffusionMapHyperParams}; pub use utils::to_gaussian_similarity; pub enum Method { - DiffusionMap, + DiffusionMap { steps: usize }, PrincipalComponentAnalysis } + +pub trait Reduced { + fn embedding(&self) -> Array2; +} diff --git a/linfa-reduction/src/pca/algorithms.rs b/linfa-reduction/src/pca/algorithms.rs index 67da32159..766f15572 100644 --- a/linfa-reduction/src/pca/algorithms.rs +++ b/linfa-reduction/src/pca/algorithms.rs @@ -1,6 +1,8 @@ use ndarray::{ArrayBase, Array1, Array2, Ix2, Axis, DataMut}; use ndarray_linalg::{TruncatedSvd, TruncatedOrder}; +use crate::Reduced; + pub struct PrincipalComponentAnalysis { embedding: Array2, explained_variance: Array1, @@ -41,3 +43,9 @@ impl PrincipalComponentAnalysis { self.explained_variance.clone() } } + +impl Reduced for PrincipalComponentAnalysis { + fn embedding(&self) -> Array2 { + self.embedding.clone() + } +} From 7d48a78337ad30da649ab621e389c85cc081d6fc Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Thu, 2 Apr 2020 11:52:19 +0200 Subject: [PATCH 18/24] Add dot kernel --- linfa-reduction/examples/diffusion_map.rs | 24 ++-- .../src/diffusion_map/algorithms.rs | 67 +++++----- linfa-reduction/src/kernel/dot.rs | 40 ++++++ linfa-reduction/src/kernel/gaussian.rs | 34 ++--- linfa-reduction/src/kernel/mod.rs | 117 +++++------------- linfa-reduction/src/lib.rs | 21 ++-- linfa-reduction/src/pca/algorithms.rs | 8 -- 7 files changed, 145 insertions(+), 166 deletions(-) create mode 100644 linfa-reduction/src/kernel/dot.rs diff --git a/linfa-reduction/examples/diffusion_map.rs 
b/linfa-reduction/examples/diffusion_map.rs index 260ca7251..16b183d9e 100644 --- a/linfa-reduction/examples/diffusion_map.rs +++ b/linfa-reduction/examples/diffusion_map.rs @@ -1,6 +1,6 @@ use linfa_clustering::generate_blobs; -use linfa_reduction::{Reduced, DiffusionMap, DiffusionMapHyperParams, Method, kernel::GaussianKernel}; -use linfa_reduction::kernel::{self, KernelWithMethod}; +use linfa_reduction::{DiffusionMap, DiffusionMapHyperParams}; +use linfa_reduction::kernel::{IntoKernel, DotKernel}; use ndarray::array; use ndarray_npy::write_npy; use ndarray_rand::rand::SeedableRng; @@ -14,25 +14,15 @@ fn main() { // For each our expected centroids, generate `n` data points around it (a "blob") let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],]; - let n = 20; + let n = 30; let dataset = generate_blobs(n, &expected_centroids, &mut rng); //let similarity = to_gaussian_similarity(&dataset, 40.0); - let diffusion_map = dataset - .kernel_with(kernel::Method::Gaussian { eps: 40.0 }) - .reduce_fixed(4, Method::DiffusionMap { steps: 10 }); + //let diffusion_map = GaussianKernel::new(&dataset, 40.0) + //.reduce_fixed(4); - //let kernel = GaussianKernel::new(&dataset, 40.0); - - // Configure our training algorithm - /*let hyperparams = DiffusionMapHyperParams::new(4) - .steps(1) - .build(); - - // Infer an optimal set of centroids based on the training data distribution and assign optimal - // indices to clusters - let diffusion_map = DiffusionMap::project(hyperparams, kernel); - dbg!(&diffusion_map.estimate_clusters());*/ + let diffusion_map = DotKernel::new(dataset.clone()) + .reduce_fixed(1); let embedding = diffusion_map.embedding(); dbg!(&embedding); diff --git a/linfa-reduction/src/diffusion_map/algorithms.rs b/linfa-reduction/src/diffusion_map/algorithms.rs index 7d8b9c773..3e2e46486 100644 --- a/linfa-reduction/src/diffusion_map/algorithms.rs +++ b/linfa-reduction/src/diffusion_map/algorithms.rs @@ -1,24 +1,25 @@ -use 
ndarray::{ArrayBase, Array1, Array2, Ix2, Axis, DataMut}; -use ndarray_linalg::{TruncatedEig, TruncatedOrder, lobpcg::LobpcgResult, lobpcg, eigh::EighInto, lapack::UPLO, close_l2}; +use ndarray::{Array1, Array2}; +use ndarray_linalg::{TruncatedOrder, lobpcg::LobpcgResult, lobpcg, eigh::EighInto, lapack::UPLO, Scalar}; use ndarray_rand::{rand_distr::Uniform, RandomExt}; +use num_traits::NumCast; -use crate::Reduced; +use crate::Float; use crate::kernel::{Kernel, IntoKernel}; use super::hyperparameters::DiffusionMapHyperParams; -pub struct DiffusionMap { +pub struct DiffusionMap { hyperparameters: DiffusionMapHyperParams, - embedding: Array2, - eigvals: Array1 + embedding: Array2, + eigvals: Array1 } -impl DiffusionMap { +impl DiffusionMap { pub fn project( hyperparameters: DiffusionMapHyperParams, - kernel: impl IntoKernel + kernel: impl IntoKernel ) -> Self { // compute spectral embedding with diffusion map - let (embedding, eigvals) = compute_diffusion_map(kernel.into_kernel(), hyperparameters.steps(), hyperparameters.embedding_size()); + let (embedding, eigvals) = compute_diffusion_map(kernel.into_kernel(), hyperparameters.steps(), 0.0, hyperparameters.embedding_size(), None); DiffusionMap { hyperparameters, @@ -34,57 +35,52 @@ impl DiffusionMap { /// Estimate the number of clusters in this embedding (very crude for now) pub fn estimate_clusters(&self) -> usize { - let mean = self.eigvals.sum() / self.eigvals.len() as f64; + let mean = self.eigvals.sum() / NumCast::from(self.eigvals.len()).unwrap(); self.eigvals.iter().filter(|x| *x > &mean).count() + 1 } /// Return a copy of the eigenvalues - pub fn eigvals(&self) -> Array1 { + pub fn eigvals(&self) -> Array1 { self.eigvals.clone() } - /// Return the embedding - pub fn embedding(self) -> Array2 { - self.embedding - } -} - -impl Reduced for DiffusionMap { - fn embedding(&self) -> Array2 { + pub fn embedding(&self) -> Array2 { self.embedding.clone() } } -fn compute_diffusion_map(kernel: impl Kernel, steps: usize, 
embedding_size: usize) -> (Array2, Array1) { +fn compute_diffusion_map(kernel: impl Kernel, steps: usize, alpha: f32, embedding_size: usize, guess: Option>) -> (Array2, Array1) { assert!(embedding_size < kernel.size()); - // calculate sum of rows let d = kernel.sum() - .mapv(|x| 1.0/x.sqrt()); + .mapv(|x| x.recip().sqrt()); // use full eigenvalue decomposition for small problem sizes let (vals, mut vecs) = if kernel.size() < 5 * embedding_size + 1 { - let mut matrix = kernel.apply_gram(Array2::from_diag(&d)); - matrix.genrows_mut().into_iter().zip(d.iter()).for_each(|(mut c, x)| c *= *x); + let mut matrix = kernel.mul_similarity(&Array2::from_diag(&d).view()); + matrix.gencolumns_mut().into_iter().zip(d.iter()).for_each(|(mut a,b)| a *= *b); let (vals, vecs) = matrix.eigh_into(UPLO::Lower).unwrap(); let (vals, vecs) = (vals.slice_move(s![..; -1]), vecs.slice_move(s![.., ..; -1])); ( - vals.slice_move(s![1..embedding_size+1]), + vals.slice_move(s![1..embedding_size+1]).mapv(|x| Scalar::from_real(x)), vecs.slice_move(s![..,1..embedding_size+1]) ) } else { // calculate truncated eigenvalue decomposition - let x = Array2::random((d.len(), embedding_size + 1), Uniform::new(0.0, 1.0)); + let x = guess.unwrap_or( + Array2::random((kernel.size(), embedding_size + 1), Uniform::new(0.0f64, 1.0)) + .mapv(|x| NumCast::from(x).unwrap()) + ); let result = lobpcg::lobpcg(|y| { let mut y = y.to_owned(); - y.genrows_mut().into_iter().zip(d.iter()).for_each(|(mut c, x)| c *= *x); - let mut y = kernel.apply_gram(y); - y.genrows_mut().into_iter().zip(d.iter()).for_each(|(mut c, x)| c *= *x); + y.genrows_mut().into_iter().zip(d.iter()).for_each(|(mut a,b)| a *= *b); + let mut y = kernel.mul_similarity(&y.view()); + y.genrows_mut().into_iter().zip(d.iter()).for_each(|(mut a,b)| a *= *b); y - }, x, |_| {}, None, 1e-5, 20, TruncatedOrder::Largest); + }, x, |_| {}, None, 1e-5, 15, TruncatedOrder::Largest); let (vals, vecs) = match result { LobpcgResult::Ok(vals, vecs, _) | 
LobpcgResult::Err(vals, vecs, _, _) => (vals, vecs), @@ -97,15 +93,16 @@ fn compute_diffusion_map(kernel: impl Kernel, steps: usize, embedding_size: vecs.slice_move(s![..,1..]) ) }; - let d = d.mapv(|x| 1.0/x); - for (idx, elm) in vecs.indexed_iter_mut() { - let (row, _) = idx; - *elm *= d[row]; + let d = d.mapv(|x| x.recip()); + + for (mut col, val) in vecs.gencolumns_mut().into_iter().zip(d.iter()) { + col *= *val; } + let steps = NumCast::from(steps).unwrap(); for (mut vec, val) in vecs.gencolumns_mut().into_iter().zip(vals.iter()) { - vec *= val.powf(steps as f64); + vec *= val.powf(steps); } (vecs, vals) diff --git a/linfa-reduction/src/kernel/dot.rs b/linfa-reduction/src/kernel/dot.rs new file mode 100644 index 000000000..a95716eed --- /dev/null +++ b/linfa-reduction/src/kernel/dot.rs @@ -0,0 +1,40 @@ +use ndarray::{Array1, ArrayView2, Array2}; +use crate::Float; +use crate::kernel::{Kernel, IntoKernel}; + +pub struct DotKernel { + data: Array2 +} + +impl DotKernel { + pub fn new(data: Array2) -> DotKernel { + DotKernel { data } + } +} + +impl Kernel for DotKernel { + fn mul_similarity(&self, rhs: &ArrayView2) -> Array2 { + if self.data.ncols() > self.data.nrows() { + self.data.dot(&self.data.t().dot(rhs)) + } else { + self.data.t().dot(&self.data.dot(rhs)) + } + } + + fn size(&self) -> usize { + usize::min(self.data.ncols(), self.data.nrows()) + } + + fn sum(&self) -> Array1 { + self.mul_similarity(&Array2::ones((self.size(), 1)).view()) + .into_shape(self.size()).unwrap() + } +} + +impl IntoKernel for DotKernel { + type IntoKer = DotKernel; + + fn into_kernel(self) -> Self::IntoKer { + self + } +} diff --git a/linfa-reduction/src/kernel/gaussian.rs b/linfa-reduction/src/kernel/gaussian.rs index ad5471872..a36d6d00a 100644 --- a/linfa-reduction/src/kernel/gaussian.rs +++ b/linfa-reduction/src/kernel/gaussian.rs @@ -1,20 +1,18 @@ -use ndarray::{Array2, Axis}; +use num_traits::NumCast; +use ndarray::{Array2, Axis, NdFloat}; +use std::iter::Sum; + +use 
crate::Float; use crate::kernel::IntoKernel; -pub struct GaussianKernel { - pub data: Array2 +pub struct GaussianKernel { + pub data: Array2 } -impl IntoKernel for GaussianKernel { - type IntoKer = Array2; - - fn into_kernel(self) -> Self::IntoKer { - self.data - } -} +impl> GaussianKernel { + pub fn new(dataset: &Array2, eps: f32) -> Self { + let eps = NumCast::from(eps).unwrap(); -impl GaussianKernel { - pub fn new(dataset: &Array2, eps: f64) -> Self { let n_observations = dataset.len_of(Axis(0)); let mut similarity = Array2::eye(n_observations); @@ -23,8 +21,8 @@ impl GaussianKernel { let a = dataset.row(i); let b = dataset.row(j); - let distance = a.iter().zip(b.iter()).map(|(x,y)| (x-y).powf(2.0)) - .sum::(); + let distance = a.iter().zip(b.iter()).map(|(x,y)| (*x-*y)*(*x-*y)) + .sum::(); similarity[(i,j)] = (-distance / eps).exp(); } @@ -35,3 +33,11 @@ impl GaussianKernel { } } } + +impl IntoKernel for GaussianKernel { + type IntoKer = Array2; + + fn into_kernel(self) -> Self::IntoKer { + self.data + } +} diff --git a/linfa-reduction/src/kernel/mod.rs b/linfa-reduction/src/kernel/mod.rs index d8ac463bd..658bf5779 100644 --- a/linfa-reduction/src/kernel/mod.rs +++ b/linfa-reduction/src/kernel/mod.rs @@ -1,34 +1,38 @@ +pub mod dot; pub mod gaussian; +pub use dot::DotKernel; pub use gaussian::GaussianKernel; +use std::iter::Sum; + use ndarray::prelude::*; use ndarray::Data; -use ndarray::NdFloat; -use num_traits::{FromPrimitive, Num}; use sprs::CsMat; -use crate::Reduced; -use crate::{DiffusionMap, DiffusionMapHyperParams, PrincipalComponentAnalysis}; +use crate::Float; +use crate::diffusion_map::{DiffusionMapHyperParams, DiffusionMap}; +use crate::pca::PrincipalComponentAnalysis; /// Symmetric kernel function, can be sparse or dense /// /// Required functions are: -/// * `apply_gram`: performs a matrix multiplication with `rhs` -/// * `sum`: returns the sum of columns or rows +/// * `mul_normalized_similarity`: performs a matrix multiplication with `rhs`, which 
is a +/// normalized (sum of columns and rows equal one) symmetric similarity matrix /// * `size`: returns the number of data-points -pub trait Kernel { - fn apply_gram>(&self, rhs: ArrayBase) -> Array2; +pub trait Kernel { + fn mul_similarity(&self, rhs: &ArrayView2) -> Array2; fn sum(&self) -> Array1; fn size(&self) -> usize; + } -impl, A: NdFloat + 'static + FromPrimitive> Kernel for ArrayBase { - fn apply_gram>(&self, rhs: ArrayBase) -> Array2 { - self.dot(&rhs) +impl, A: Float> Kernel for ArrayBase { + fn mul_similarity(&self, rhs: &ArrayView2) -> Array2 { + self.dot(rhs) } fn sum(&self) -> Array1 { - self.sum_axis(ndarray::Axis(0)) + self.sum_axis(Axis(1)) } fn size(&self) -> usize { @@ -38,19 +42,20 @@ impl, A: NdFloat + 'static + FromPrimitive> Kernel for Arra } } -impl Kernel for CsMat { - fn apply_gram>(&self, rhs: ArrayBase) -> Array2 { - self * &rhs +impl Kernel for CsMat { + fn mul_similarity(&self, rhs: &ArrayView2) -> Array2 { + self * rhs } fn sum(&self) -> Array1 { - let mut mean = Array1::zeros(self.cols()); + let mut sum = Array1::zeros(self.cols()); - for (val, (_, col)) in self.iter() { - mean[col] += *val; + for (val, i) in self.iter() { + let (_, col) = i; + sum[col] += *val; } - mean + sum } fn size(&self) -> usize { @@ -61,80 +66,24 @@ impl Kernel for CsMat { +pub trait IntoKernel { type IntoKer: Kernel; fn into_kernel(self) -> Self::IntoKer; -} - -pub trait KernelWithMethod { - type IntoKer: Kernel; - fn kernel_with(&self, method: Method) -> Self::IntoKer; -} - -impl KernelWithMethod for Array2 { - type IntoKer = Kernels; + fn reduce_fixed(self, embedding_size: usize) -> DiffusionMap + where Self: Sized + { + let params = DiffusionMapHyperParams::new(embedding_size) + .steps(1) + .build(); - fn kernel_with(&self, method: Method) -> Self::IntoKer { - match method { - Method::Gaussian { eps } => Kernels::Gaussian(GaussianKernel::new(&self, eps)), - } + DiffusionMap::project(params, self) } } - pub enum Method { - Gaussian { eps: f64 }, -} - 
-pub enum Kernels { - Gaussian(GaussianKernel) -} - -impl Kernels { - pub fn reduce(self, embedding_size: usize, method: crate::Method) -> impl Reduced { - match method { - crate::Method::DiffusionMap { steps } => { - let params = DiffusionMapHyperParams::new(embedding_size) - .steps(steps) - .build(); - - DiffusionMap::project(params, self) - }, - crate::Method::PrincipalComponentAnalysis => { - PrincipalComponentAnalysis::fit(self, embedding_size) - } - } - } -} - -impl Kernel for Kernels { - fn apply_gram>(&self, rhs: ArrayBase) -> Array2 { - match self { - Kernels::Gaussian(x) => x.data.apply_gram(rhs), - } - } - - fn sum(&self) -> Array1 { - match self { - Kernels::Gaussian(x) => Kernel::sum(&x.data), - } - } - - fn size(&self) -> usize { - match self { - Kernels::Gaussian(x) => x.data.size(), - } - } -} - -impl IntoKernel for Kernels { - type IntoKer = Kernels; - - fn into_kernel(self) -> Self::IntoKer { - self - } + Gaussian { eps: f32 }, } #[cfg(test)] diff --git a/linfa-reduction/src/lib.rs b/linfa-reduction/src/lib.rs index 40aba1384..2086e3567 100644 --- a/linfa-reduction/src/lib.rs +++ b/linfa-reduction/src/lib.rs @@ -5,16 +5,21 @@ pub mod diffusion_map; pub mod pca; pub mod utils; -use ndarray::Array2; -pub use pca::PrincipalComponentAnalysis; pub use diffusion_map::{DiffusionMap, DiffusionMapHyperParams}; pub use utils::to_gaussian_similarity; -pub enum Method { - DiffusionMap { steps: usize }, - PrincipalComponentAnalysis -} +use ndarray::NdFloat; +use ndarray_linalg::Lapack; +use num_traits::FromPrimitive; -pub trait Reduced { - fn embedding(&self) -> Array2; +pub trait Float: + NdFloat + + Lapack + + Default + + Clone + + FromPrimitive +{ } + +impl Float for f32 {} +impl Float for f64 {} diff --git a/linfa-reduction/src/pca/algorithms.rs b/linfa-reduction/src/pca/algorithms.rs index 766f15572..67da32159 100644 --- a/linfa-reduction/src/pca/algorithms.rs +++ b/linfa-reduction/src/pca/algorithms.rs @@ -1,8 +1,6 @@ use ndarray::{ArrayBase, Array1, 
Array2, Ix2, Axis, DataMut}; use ndarray_linalg::{TruncatedSvd, TruncatedOrder}; -use crate::Reduced; - pub struct PrincipalComponentAnalysis { embedding: Array2, explained_variance: Array1, @@ -43,9 +41,3 @@ impl PrincipalComponentAnalysis { self.explained_variance.clone() } } - -impl Reduced for PrincipalComponentAnalysis { - fn embedding(&self) -> Array2 { - self.embedding.clone() - } -} From a86b70d30393dcda713f298f74a8ce046c423866 Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Fri, 3 Apr 2020 12:02:16 +0200 Subject: [PATCH 19/24] Add sparse gaussian kernel with `hnsw` --- linfa-reduction/Cargo.toml | 2 + linfa-reduction/src/kernel/mod.rs | 2 + linfa-reduction/src/kernel/sparse_gaussian.rs | 88 +++++++++++++++++++ 3 files changed, 92 insertions(+) create mode 100644 linfa-reduction/src/kernel/sparse_gaussian.rs diff --git a/linfa-reduction/Cargo.toml b/linfa-reduction/Cargo.toml index 18c693ff7..35306e51f 100644 --- a/linfa-reduction/Cargo.toml +++ b/linfa-reduction/Cargo.toml @@ -20,6 +20,8 @@ ndarray-linalg = { path = "../../ndarray-linalg", features = ["openblas"] } sprs = "0.7" serde = { version = "1", features = ["derive"] } num-traits = "0.1.32" +hnsw = "0.6" +space = "0.10" [dev-dependencies] rand_isaac = "0.2.0" diff --git a/linfa-reduction/src/kernel/mod.rs b/linfa-reduction/src/kernel/mod.rs index 658bf5779..d59ab1a86 100644 --- a/linfa-reduction/src/kernel/mod.rs +++ b/linfa-reduction/src/kernel/mod.rs @@ -1,7 +1,9 @@ pub mod dot; pub mod gaussian; +pub mod sparse_gaussian; pub use dot::DotKernel; pub use gaussian::GaussianKernel; +pub use sparse_gaussian::SparseGaussianKernel; use std::iter::Sum; diff --git a/linfa-reduction/src/kernel/sparse_gaussian.rs b/linfa-reduction/src/kernel/sparse_gaussian.rs new file mode 100644 index 000000000..fa8bc3879 --- /dev/null +++ b/linfa-reduction/src/kernel/sparse_gaussian.rs @@ -0,0 +1,88 @@ +use sprs::{CsMat, CsMatBase}; +use hnsw::{Searcher, HNSW}; +use space::{Neighbor, MetricPoint}; +use 
ndarray::{Array2, Axis, ArrayView1}; + +use num_traits::NumCast; + +use crate::Float; +use crate::kernel::IntoKernel; + +struct Euclidean<'a, A>(ArrayView1<'a, A>); + +impl MetricPoint for Euclidean<'_, A> { + fn distance(&self, rhs: &Self) -> u32 { + let val = self.0 + .iter() + .zip(rhs.0.iter()) + .map(|(&a, &b)| (a - b) * (a - b)) + .sum::() + .sqrt(); + + space::f32_metric(val.to_f32().unwrap()) + } +} + +pub struct SparseGaussianKernel { + data: CsMat +} + +impl SparseGaussianKernel { + pub fn new(dataset: &Array2, k: usize) -> Self { + let data = find_k_nearest_neighbours(dataset, k); + + SparseGaussianKernel { data } + } +} + +impl IntoKernel for SparseGaussianKernel { + type IntoKer = CsMat; + + fn into_kernel(self) -> Self::IntoKer { + self.data + } +} + +pub fn find_k_nearest_neighbours(dataset: &Array2, k: usize) -> CsMat { + let n_points = dataset.len_of(Axis(1)); + + // ensure that the number of neighbours is at least one and less than the total number of + // points + assert!(k < n_points); + assert!(k > 0); + + let mut searcher = Searcher::default(); + let mut hnsw: HNSW> = HNSW::new(); + + // insert all columns as data points into HNSW graph + for feature in dataset.gencolumns().into_iter() { + hnsw.insert(Euclidean(feature), &mut searcher); + } + + // allocate buffer for k neighbours (plus the points itself) + let mut neighbours = vec![Neighbor::invalid(); k + 1]; + + // allocate buffer to initialize the sparse matrix later on + // * data: we have exact #points * k positive entries + // * indptr: has structure [0,k,2k,...,#points*k] + // * indices: filled with the nearest indices + let data = vec![NumCast::from(1.0).unwrap(); n_points * k]; + let indptr = (0..n_points+1).map(|x| x * k).collect::>(); + let mut indices = Vec::with_capacity(n_points * k); + + // find neighbours for each data point + for feature in dataset.gencolumns().into_iter() { + hnsw.nearest(&Euclidean(feature), 3 * k, &mut searcher, &mut neighbours); + + // sort by indices + 
neighbours.sort_unstable(); + + // push each index into the indices array + for n in &neighbours { + indices.push(n.index); + } + } + + // create CSR matrix from data, indptr and indices + CsMatBase::new((n_points, n_points), indptr, indices, data) +} From 7235a950ef4803f094945857bcf1b3be244bbe69 Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Sat, 4 Apr 2020 18:59:40 +0200 Subject: [PATCH 20/24] Fix sparse gaussian kernel --- linfa-reduction/examples/diffusion_map.rs | 8 +-- linfa-reduction/src/kernel/sparse_gaussian.rs | 64 +++++++++++++++---- 2 files changed, 56 insertions(+), 16 deletions(-) diff --git a/linfa-reduction/examples/diffusion_map.rs b/linfa-reduction/examples/diffusion_map.rs index 16b183d9e..9264b8e37 100644 --- a/linfa-reduction/examples/diffusion_map.rs +++ b/linfa-reduction/examples/diffusion_map.rs @@ -1,6 +1,6 @@ use linfa_clustering::generate_blobs; use linfa_reduction::{DiffusionMap, DiffusionMapHyperParams}; -use linfa_reduction::kernel::{IntoKernel, DotKernel}; +use linfa_reduction::kernel::{IntoKernel, DotKernel, SparseGaussianKernel}; use ndarray::array; use ndarray_npy::write_npy; use ndarray_rand::rand::SeedableRng; @@ -14,15 +14,15 @@ fn main() { // For each our expected centroids, generate `n` data points around it (a "blob") let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],]; - let n = 30; + let n = 10; let dataset = generate_blobs(n, &expected_centroids, &mut rng); //let similarity = to_gaussian_similarity(&dataset, 40.0); //let diffusion_map = GaussianKernel::new(&dataset, 40.0) //.reduce_fixed(4); - let diffusion_map = DotKernel::new(dataset.clone()) - .reduce_fixed(1); + let diffusion_map = SparseGaussianKernel::new(&dataset.clone(), 4) + .reduce_fixed(2); let embedding = diffusion_map.embedding(); dbg!(&embedding); diff --git a/linfa-reduction/src/kernel/sparse_gaussian.rs b/linfa-reduction/src/kernel/sparse_gaussian.rs index fa8bc3879..172f3ffc6 100644 --- 
a/linfa-reduction/src/kernel/sparse_gaussian.rs +++ b/linfa-reduction/src/kernel/sparse_gaussian.rs @@ -1,5 +1,5 @@ use sprs::{CsMat, CsMatBase}; -use hnsw::{Searcher, HNSW}; +use hnsw::{Searcher, HNSW, Params}; use space::{Neighbor, MetricPoint}; use ndarray::{Array2, Axis, ArrayView1}; @@ -29,7 +29,19 @@ pub struct SparseGaussianKernel { impl SparseGaussianKernel { pub fn new(dataset: &Array2, k: usize) -> Self { - let data = find_k_nearest_neighbours(dataset, k); + let mut data = find_k_nearest_neighbours(dataset, k); + + for (i, mut vec) in data.outer_iterator_mut().enumerate() { + for (j, mut val) in vec.iter_mut() { + let a = dataset.row(i); + let b = dataset.row(j); + + let distance = a.iter().zip(b.iter()).map(|(x,y)| (*x-*y)*(*x-*y)) + .sum::(); + + *val = (-distance / NumCast::from(60.0).unwrap()).exp(); + } + } SparseGaussianKernel { data } } @@ -44,18 +56,21 @@ impl IntoKernel for SparseGaussianKernel { } pub fn find_k_nearest_neighbours(dataset: &Array2, k: usize) -> CsMat { - let n_points = dataset.len_of(Axis(1)); + let n_points = dataset.len_of(Axis(0)); // ensure that the number of neighbours is at least one and less than the total number of // points assert!(k < n_points); assert!(k > 0); + let params = Params::new() + .ef_construction(100); + let mut searcher = Searcher::default(); - let mut hnsw: HNSW> = HNSW::new(); + let mut hnsw: HNSW> = HNSW::new_params(params); - // insert all columns as data points into HNSW graph - for feature in dataset.gencolumns().into_iter() { + // insert all rows as data points into HNSW graph + for feature in dataset.genrows().into_iter() { hnsw.insert(Euclidean(feature), &mut searcher); } @@ -66,23 +81,48 @@ pub fn find_k_nearest_neighbours(dataset: &Array2, k: usize) -> CsM // * data: we have exact #points * k positive entries // * indptr: has structure [0,k,2k,...,#points*k] // * indices: filled with the nearest indices - let data = vec![NumCast::from(1.0).unwrap(); n_points * k]; - let indptr = 
(0..n_points+1).map(|x| x * k).collect::>(); - let mut indices = Vec::with_capacity(n_points * k); + let mut data = Vec::with_capacity(n_points * (k+1)); + let mut indptr = Vec::with_capacity(n_points + 1); + //let indptr = (0..n_points+1).map(|x| x * (k+1)).collect::>(); + let mut indices = Vec::with_capacity(n_points * (k+1)); + indptr.push(0); // find neighbours for each data point - for feature in dataset.gencolumns().into_iter() { + let mut added = 0; + for (m, feature) in dataset.genrows().into_iter().enumerate() { hnsw.nearest(&Euclidean(feature), 3 * k, &mut searcher, &mut neighbours); + //dbg!(&neighbours); + // sort by indices neighbours.sort_unstable(); + indices.push(m); + data.push(NumCast::from(1.0).unwrap()); + added += 1; + // push each index into the indices array for n in &neighbours { - indices.push(n.index); + if m < n.index { + indices.push(n.index); + data.push(NumCast::from(1.0).unwrap()); + added += 1; + } } + + indptr.push(added); } + // create CSR matrix from data, indptr and indices - CsMatBase::new((n_points, n_points), indptr, indices, data) + let mat = CsMatBase::new((n_points, n_points), indptr, indices, data); + let mut mat = &mat + &mat.transpose_view(); + //dbg!(mat.to_dense()); + + let val: A = NumCast::from(1.0).unwrap(); + for i in 0..(n_points) { + mat.set(i, i, val); + } + + mat } From 133b6122c14542255e4b7dc6fa2298f405b4e7d8 Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Tue, 7 Apr 2020 21:30:00 +0200 Subject: [PATCH 21/24] Ensure that sparse kernel has elements for each point --- linfa-reduction/examples/diffusion_map.rs | 19 ++++++------- .../src/diffusion_map/algorithms.rs | 25 +++++++++++------ linfa-reduction/src/kernel/sparse_gaussian.rs | 8 +++--- linfa-reduction/src/utils.rs | 27 +++++++++++++++++++ 4 files changed, 58 insertions(+), 21 deletions(-) diff --git a/linfa-reduction/examples/diffusion_map.rs b/linfa-reduction/examples/diffusion_map.rs index 9264b8e37..da9352a2d 100644 --- 
a/linfa-reduction/examples/diffusion_map.rs +++ b/linfa-reduction/examples/diffusion_map.rs @@ -1,6 +1,6 @@ -use linfa_clustering::generate_blobs; use linfa_reduction::{DiffusionMap, DiffusionMapHyperParams}; -use linfa_reduction::kernel::{IntoKernel, DotKernel, SparseGaussianKernel}; +use linfa_reduction::utils::generate_swissroll; +use linfa_reduction::kernel::{IntoKernel, DotKernel, SparseGaussianKernel, GaussianKernel}; use ndarray::array; use ndarray_npy::write_npy; use ndarray_rand::rand::SeedableRng; @@ -13,19 +13,20 @@ fn main() { let mut rng = Isaac64Rng::seed_from_u64(42); // For each our expected centroids, generate `n` data points around it (a "blob") - let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],]; - let n = 10; - let dataset = generate_blobs(n, &expected_centroids, &mut rng); + let n = 2000; + let dataset = generate_swissroll(40.0, 1.0, n, &mut rng); + //let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],]; + //let dataset = generate_blobs(n, &expected_centroids, &mut rng); //let similarity = to_gaussian_similarity(&dataset, 40.0); - //let diffusion_map = GaussianKernel::new(&dataset, 40.0) - //.reduce_fixed(4); + //let diffusion_map = GaussianKernel::new(&dataset, 100.0) + // .reduce_fixed(2); - let diffusion_map = SparseGaussianKernel::new(&dataset.clone(), 4) + let diffusion_map = SparseGaussianKernel::new(&dataset.clone(), 20, 100.0) .reduce_fixed(2); let embedding = diffusion_map.embedding(); - dbg!(&embedding); + //dbg!(&embedding); // Save to disk our dataset (and the cluster label assigned to each observation) // We use the `npy` format for compatibility with NumPy diff --git a/linfa-reduction/src/diffusion_map/algorithms.rs b/linfa-reduction/src/diffusion_map/algorithms.rs index 3e2e46486..23b0f4883 100644 --- a/linfa-reduction/src/diffusion_map/algorithms.rs +++ b/linfa-reduction/src/diffusion_map/algorithms.rs @@ -19,7 +19,7 @@ impl DiffusionMap { kernel: impl IntoKernel ) -> 
Self { // compute spectral embedding with diffusion map - let (embedding, eigvals) = compute_diffusion_map(kernel.into_kernel(), hyperparameters.steps(), 0.0, hyperparameters.embedding_size(), None); + let (embedding, eigvals) = compute_diffusion_map(kernel.into_kernel(), hyperparameters.steps(), 1.0, hyperparameters.embedding_size(), None); DiffusionMap { hyperparameters, @@ -53,7 +53,9 @@ fn compute_diffusion_map(kernel: impl Kernel, steps: usize, alpha: assert!(embedding_size < kernel.size()); let d = kernel.sum() - .mapv(|x| x.recip().sqrt()); + .mapv(|x| x.recip()); + + let d2 = d.mapv(|x| x.powf(NumCast::from(0.5 + alpha).unwrap())); // use full eigenvalue decomposition for small problem sizes let (vals, mut vecs) = if kernel.size() < 5 * embedding_size + 1 { @@ -75,18 +77,19 @@ fn compute_diffusion_map(kernel: impl Kernel, steps: usize, alpha: let result = lobpcg::lobpcg(|y| { let mut y = y.to_owned(); - y.genrows_mut().into_iter().zip(d.iter()).for_each(|(mut a,b)| a *= *b); + y.genrows_mut().into_iter().zip(d2.iter()).for_each(|(mut a,b)| a *= *b); let mut y = kernel.mul_similarity(&y.view()); - y.genrows_mut().into_iter().zip(d.iter()).for_each(|(mut a,b)| a *= *b); + y.genrows_mut().into_iter().zip(d2.iter()).for_each(|(mut a,b)| a *= *b); y - }, x, |_| {}, None, 1e-5, 15, TruncatedOrder::Largest); + }, x, |_| {}, None, 1e-15, 200, TruncatedOrder::Largest); let (vals, vecs) = match result { - LobpcgResult::Ok(vals, vecs, _) | LobpcgResult::Err(vals, vecs, _, _) => (vals, vecs), + LobpcgResult::Ok(vals, vecs, n) | LobpcgResult::Err(vals, vecs, n, _) => { dbg!(&n); (vals, vecs)}, _ => panic!("Eigendecomposition failed!") }; + // cut away first eigenvalue/eigenvector ( vals.slice_move(s![1..]), @@ -94,12 +97,18 @@ fn compute_diffusion_map(kernel: impl Kernel, steps: usize, alpha: ) }; - let d = d.mapv(|x| x.recip()); + //let d = d.mapv(|x| x.recip()); - for (mut col, val) in vecs.gencolumns_mut().into_iter().zip(d.iter()) { + let d = d.mapv(|x| x.sqrt()); + 
+ for (mut col, val) in vecs.genrows_mut().into_iter().zip(d.iter()) { col *= *val; + + // scale with first component + //col /= col[0]; } + //dbg!(&vecs); let steps = NumCast::from(steps).unwrap(); for (mut vec, val) in vecs.gencolumns_mut().into_iter().zip(vals.iter()) { vec *= val.powf(steps); diff --git a/linfa-reduction/src/kernel/sparse_gaussian.rs b/linfa-reduction/src/kernel/sparse_gaussian.rs index 172f3ffc6..b12946442 100644 --- a/linfa-reduction/src/kernel/sparse_gaussian.rs +++ b/linfa-reduction/src/kernel/sparse_gaussian.rs @@ -28,7 +28,7 @@ pub struct SparseGaussianKernel { } impl SparseGaussianKernel { - pub fn new(dataset: &Array2, k: usize) -> Self { + pub fn new(dataset: &Array2, k: usize, eps: f32) -> Self { let mut data = find_k_nearest_neighbours(dataset, k); for (i, mut vec) in data.outer_iterator_mut().enumerate() { @@ -39,7 +39,7 @@ impl SparseGaussianKernel { let distance = a.iter().zip(b.iter()).map(|(x,y)| (*x-*y)*(*x-*y)) .sum::(); - *val = (-distance / NumCast::from(60.0).unwrap()).exp(); + *val = (-distance / NumCast::from(eps).unwrap()).exp(); } } @@ -64,7 +64,7 @@ pub fn find_k_nearest_neighbours(dataset: &Array2, k: usize) -> CsM assert!(k > 0); let params = Params::new() - .ef_construction(100); + .ef_construction(1000); let mut searcher = Searcher::default(); let mut hnsw: HNSW> = HNSW::new_params(params); @@ -103,7 +103,7 @@ pub fn find_k_nearest_neighbours(dataset: &Array2, k: usize) -> CsM // push each index into the indices array for n in &neighbours { - if m < n.index { + if m != n.index { indices.push(n.index); data.push(NumCast::from(1.0).unwrap()); added += 1; diff --git a/linfa-reduction/src/utils.rs b/linfa-reduction/src/utils.rs index 830b195f4..e015cd752 100644 --- a/linfa-reduction/src/utils.rs +++ b/linfa-reduction/src/utils.rs @@ -1,4 +1,5 @@ use ndarray::{Array2, Ix2, Data, Axis, ArrayBase}; +use ndarray_rand::rand::Rng; /// Computes a similarity matrix with gaussian kernel and scaling parameter `eps` /// @@ -25,3 
+26,29 @@ pub fn to_gaussian_similarity( similarity } +/// +/// Generates a three-dimensional swiss roll, centered at the origin with height `height` and +/// outwards speed `speed` +pub fn generate_swissroll( + height: f64, + speed: f64, + n_points: usize, + rng: &mut impl Rng, +) -> Array2 { + let mut roll: Array2 = Array2::zeros((n_points, 3)); + + for i in 0..n_points { + let z = rng.gen_range(0.0, height); + let phi: f64 = rng.gen_range(0.0, 10.0); + //let offset: f64 = rng.gen_range(-0.5, 0.5); + let offset = 0.0; + + let x = phi * phi.cos() + offset; + let y = phi * phi.sin() + offset; + + roll[(i, 0)] = x; + roll[(i, 1)] = y; + roll[(i, 2)] = z; + } + roll +} From e384ba3e810cb20e6f515c685d7eb22f1076c9e2 Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Wed, 8 Apr 2020 19:03:09 +0200 Subject: [PATCH 22/24] Simplify sparse kernels and add polynomial kernel --- linfa-reduction/examples/diffusion_map.rs | 23 +++----- .../src/diffusion_map/algorithms.rs | 3 +- linfa-reduction/src/kernel/dot.rs | 40 ------------- linfa-reduction/src/kernel/gaussian.rs | 54 +++++++++++------ linfa-reduction/src/kernel/mod.rs | 42 +++++++++++--- linfa-reduction/src/kernel/polynomial.rs | 58 +++++++++++++++++++ .../kernel/{sparse_gaussian.rs => sparse.rs} | 45 ++------------ linfa-reduction/src/utils.rs | 33 ++++++++++- 8 files changed, 174 insertions(+), 124 deletions(-) delete mode 100644 linfa-reduction/src/kernel/dot.rs create mode 100644 linfa-reduction/src/kernel/polynomial.rs rename linfa-reduction/src/kernel/{sparse_gaussian.rs => sparse.rs} (70%) diff --git a/linfa-reduction/examples/diffusion_map.rs b/linfa-reduction/examples/diffusion_map.rs index da9352a2d..5c474eb9b 100644 --- a/linfa-reduction/examples/diffusion_map.rs +++ b/linfa-reduction/examples/diffusion_map.rs @@ -1,7 +1,5 @@ -use linfa_reduction::{DiffusionMap, DiffusionMapHyperParams}; -use linfa_reduction::utils::generate_swissroll; -use linfa_reduction::kernel::{IntoKernel, DotKernel, SparseGaussianKernel,
GaussianKernel}; -use ndarray::array; +use linfa_reduction::utils::generate_convoluted_rings; +use linfa_reduction::kernel::{IntoKernel, SparsePolynomialKernel}; use ndarray_npy::write_npy; use ndarray_rand::rand::SeedableRng; use rand_isaac::Isaac64Rng; @@ -13,20 +11,17 @@ fn main() { let mut rng = Isaac64Rng::seed_from_u64(42); // For each our expected centroids, generate `n` data points around it (a "blob") - let n = 2000; - let dataset = generate_swissroll(40.0, 1.0, n, &mut rng); - //let expected_centroids = array![[10., 10.], [1., 12.], [20., 30.], [-20., 30.],]; - //let dataset = generate_blobs(n, &expected_centroids, &mut rng); - //let similarity = to_gaussian_similarity(&dataset, 40.0); + let n = 3000; - //let diffusion_map = GaussianKernel::new(&dataset, 100.0) - // .reduce_fixed(2); + // generate three convoluted rings + let dataset = generate_convoluted_rings(&[(0.0, 3.0), (10.0, 13.0), (20.0, 23.0)], n, &mut rng); - let diffusion_map = SparseGaussianKernel::new(&dataset.clone(), 20, 100.0) - .reduce_fixed(2); + // generate sparse polynomial kernel with k = 14, c = 5 and d = 2 + let diffusion_map = SparsePolynomialKernel::new(&dataset, 14, 5.0, 2.0) + .reduce_fixed(4); + // get embedding let embedding = diffusion_map.embedding(); - //dbg!(&embedding); // Save to disk our dataset (and the cluster label assigned to each observation) // We use the `npy` format for compatibility with NumPy diff --git a/linfa-reduction/src/diffusion_map/algorithms.rs b/linfa-reduction/src/diffusion_map/algorithms.rs index 23b0f4883..3aaa62e5c 100644 --- a/linfa-reduction/src/diffusion_map/algorithms.rs +++ b/linfa-reduction/src/diffusion_map/algorithms.rs @@ -19,7 +19,7 @@ impl DiffusionMap { kernel: impl IntoKernel ) -> Self { // compute spectral embedding with diffusion map - let (embedding, eigvals) = compute_diffusion_map(kernel.into_kernel(), hyperparameters.steps(), 1.0, hyperparameters.embedding_size(), None); + let (embedding, eigvals) = 
compute_diffusion_map(kernel.into_kernel(), hyperparameters.steps(), 0.0, hyperparameters.embedding_size(), None); DiffusionMap { hyperparameters, @@ -99,6 +99,7 @@ fn compute_diffusion_map(kernel: impl Kernel, steps: usize, alpha: //let d = d.mapv(|x| x.recip()); + dbg!(&vals); let d = d.mapv(|x| x.sqrt()); for (mut col, val) in vecs.genrows_mut().into_iter().zip(d.iter()) { diff --git a/linfa-reduction/src/kernel/dot.rs b/linfa-reduction/src/kernel/dot.rs deleted file mode 100644 index a95716eed..000000000 --- a/linfa-reduction/src/kernel/dot.rs +++ /dev/null @@ -1,40 +0,0 @@ -use ndarray::{Array1, ArrayView2, Array2}; -use crate::Float; -use crate::kernel::{Kernel, IntoKernel}; - -pub struct DotKernel { - data: Array2 -} - -impl DotKernel { - pub fn new(data: Array2) -> DotKernel { - DotKernel { data } - } -} - -impl Kernel for DotKernel { - fn mul_similarity(&self, rhs: &ArrayView2) -> Array2 { - if self.data.ncols() > self.data.nrows() { - self.data.dot(&self.data.t().dot(rhs)) - } else { - self.data.t().dot(&self.data.dot(rhs)) - } - } - - fn size(&self) -> usize { - usize::min(self.data.ncols(), self.data.nrows()) - } - - fn sum(&self) -> Array1 { - self.mul_similarity(&Array2::ones((self.size(), 1)).view()) - .into_shape(self.size()).unwrap() - } -} - -impl IntoKernel for DotKernel { - type IntoKer = DotKernel; - - fn into_kernel(self) -> Self::IntoKer { - self - } -} diff --git a/linfa-reduction/src/kernel/gaussian.rs b/linfa-reduction/src/kernel/gaussian.rs index a36d6d00a..20de6f321 100644 --- a/linfa-reduction/src/kernel/gaussian.rs +++ b/linfa-reduction/src/kernel/gaussian.rs @@ -1,35 +1,28 @@ use num_traits::NumCast; -use ndarray::{Array2, Axis, NdFloat}; +use ndarray::{Array2, ArrayView1}; +use sprs::CsMat; use std::iter::Sum; use crate::Float; -use crate::kernel::IntoKernel; +use crate::kernel::{IntoKernel, dense_from_fn, sparse_from_fn}; + +fn kernel(a: ArrayView1, b: ArrayView1, eps: A) -> A { + let distance = a.iter().zip(b.iter()).map(|(x,y)| 
(*x-*y)*(*x-*y)) + .sum::(); + + (-distance / eps).exp() +} pub struct GaussianKernel { pub data: Array2 } -impl> GaussianKernel { +impl> GaussianKernel { pub fn new(dataset: &Array2, eps: f32) -> Self { let eps = NumCast::from(eps).unwrap(); - let n_observations = dataset.len_of(Axis(0)); - let mut similarity = Array2::eye(n_observations); - - for i in 0..n_observations { - for j in 0..n_observations { - let a = dataset.row(i); - let b = dataset.row(j); - - let distance = a.iter().zip(b.iter()).map(|(x,y)| (*x-*y)*(*x-*y)) - .sum::(); - - similarity[(i,j)] = (-distance / eps).exp(); - } - } - GaussianKernel { - data: similarity + data: dense_from_fn(dataset, |a,b| kernel(a,b,eps)) } } } @@ -41,3 +34,26 @@ impl IntoKernel for GaussianKernel { self.data } } + +pub struct SparseGaussianKernel { + similarity: CsMat +} + +impl SparseGaussianKernel { + pub fn new(dataset: &Array2, k: usize, eps: f32) -> Self { + let eps = NumCast::from(eps).unwrap(); + + SparseGaussianKernel { + similarity: sparse_from_fn(dataset, k, |a,b| kernel(a,b,eps)) + } + } +} + +impl IntoKernel for SparseGaussianKernel { + type IntoKer = CsMat; + + fn into_kernel(self) -> Self::IntoKer { + self.similarity + } +} + diff --git a/linfa-reduction/src/kernel/mod.rs b/linfa-reduction/src/kernel/mod.rs index d59ab1a86..808cbdd03 100644 --- a/linfa-reduction/src/kernel/mod.rs +++ b/linfa-reduction/src/kernel/mod.rs @@ -1,11 +1,9 @@ -pub mod dot; +pub mod sparse; pub mod gaussian; -pub mod sparse_gaussian; -pub use dot::DotKernel; -pub use gaussian::GaussianKernel; -pub use sparse_gaussian::SparseGaussianKernel; +pub mod polynomial; -use std::iter::Sum; +pub use gaussian::{GaussianKernel, SparseGaussianKernel}; +pub use polynomial::{PolynomialKernel, SparsePolynomialKernel}; use ndarray::prelude::*; use ndarray::Data; @@ -13,7 +11,6 @@ use sprs::CsMat; use crate::Float; use crate::diffusion_map::{DiffusionMapHyperParams, DiffusionMap}; -use crate::pca::PrincipalComponentAnalysis; /// Symmetric kernel 
function, can be sparse or dense /// @@ -84,8 +81,35 @@ pub trait IntoKernel { } } -pub enum Method { - Gaussian { eps: f32 }, +pub fn dense_from_fn, ArrayView1) -> A>(dataset: &Array2, fnc: T) -> Array2 { + let n_observations = dataset.len_of(Axis(0)); + let mut similarity = Array2::eye(n_observations); + + for i in 0..n_observations { + for j in 0..n_observations { + let a = dataset.row(i); + let b = dataset.row(j); + + similarity[(i, j)] = fnc(a, b); + } + } + + similarity +} + +pub fn sparse_from_fn, ArrayView1) -> A>(dataset: &Array2, k: usize, fnc: T) -> CsMat { + let mut data = sparse::adjacency_matrix(dataset, k); + + for (i, mut vec) in data.outer_iterator_mut().enumerate() { + for (j, val) in vec.iter_mut() { + let a = dataset.row(i); + let b = dataset.row(j); + + *val = fnc(a, b); + } + } + + data } #[cfg(test)] diff --git a/linfa-reduction/src/kernel/polynomial.rs b/linfa-reduction/src/kernel/polynomial.rs new file mode 100644 index 000000000..0052ef305 --- /dev/null +++ b/linfa-reduction/src/kernel/polynomial.rs @@ -0,0 +1,58 @@ +use num_traits::NumCast; +use ndarray::{Array2, ArrayView1}; +use sprs::CsMat; +use std::iter::Sum; + +use crate::Float; +use crate::kernel::{IntoKernel, dense_from_fn, sparse_from_fn}; + +fn kernel(a: ArrayView1, b: ArrayView1, c: A, d: A) -> A { + (a.dot(&b) + c).powf(d) +} + +pub struct PolynomialKernel { + pub data: Array2 +} + +impl> PolynomialKernel { + pub fn new(dataset: &Array2, c: f32, d: f32) -> Self { + let c = NumCast::from(c).unwrap(); + let d = NumCast::from(d).unwrap(); + + PolynomialKernel { + data: dense_from_fn(dataset, |a,b| kernel(a,b,c,d)) + } + } +} + +impl IntoKernel for PolynomialKernel { + type IntoKer = Array2; + + fn into_kernel(self) -> Self::IntoKer { + self.data + } +} + +pub struct SparsePolynomialKernel { + similarity: CsMat +} + +impl SparsePolynomialKernel { + pub fn new(dataset: &Array2, k: usize, c: A, d: A) -> Self { + let c = NumCast::from(c).unwrap(); + let d = NumCast::from(d).unwrap(); 
+ + SparsePolynomialKernel { + similarity: sparse_from_fn(dataset, k, |a,b| kernel(a,b,c,d)) + } + } +} + +impl IntoKernel for SparsePolynomialKernel { + type IntoKer = CsMat; + + fn into_kernel(self) -> Self::IntoKer { + self.similarity + } +} + diff --git a/linfa-reduction/src/kernel/sparse_gaussian.rs b/linfa-reduction/src/kernel/sparse.rs similarity index 70% rename from linfa-reduction/src/kernel/sparse_gaussian.rs rename to linfa-reduction/src/kernel/sparse.rs index b12946442..8a7dd2bf4 100644 --- a/linfa-reduction/src/kernel/sparse_gaussian.rs +++ b/linfa-reduction/src/kernel/sparse.rs @@ -6,8 +6,8 @@ use ndarray::{Array2, Axis, ArrayView1}; use num_traits::NumCast; use crate::Float; -use crate::kernel::IntoKernel; +/// Implementation of euclidean distance for ndarray struct Euclidean<'a, A>(ArrayView1<'a, A>); impl MetricPoint for Euclidean<'_, A> { @@ -23,39 +23,8 @@ impl MetricPoint for Euclidean<'_, A> { } } -pub struct SparseGaussianKernel { - data: CsMat -} - -impl SparseGaussianKernel { - pub fn new(dataset: &Array2, k: usize, eps: f32) -> Self { - let mut data = find_k_nearest_neighbours(dataset, k); - - for (i, mut vec) in data.outer_iterator_mut().enumerate() { - for (j, mut val) in vec.iter_mut() { - let a = dataset.row(i); - let b = dataset.row(j); - - let distance = a.iter().zip(b.iter()).map(|(x,y)| (*x-*y)*(*x-*y)) - .sum::(); - - *val = (-distance / NumCast::from(eps).unwrap()).exp(); - } - } - - SparseGaussianKernel { data } - } -} - -impl IntoKernel for SparseGaussianKernel { - type IntoKer = CsMat; - - fn into_kernel(self) -> Self::IntoKer { - self.data - } -} - -pub fn find_k_nearest_neighbours(dataset: &Array2, k: usize) -> CsMat { +/// Create sparse adjacency matrix from dense dataset +pub fn adjacency_matrix(dataset: &Array2, k: usize) -> CsMat { let n_points = dataset.len_of(Axis(0)); // ensure that the number of neighbours is at least one and less than the total number of @@ -64,7 +33,7 @@ pub fn find_k_nearest_neighbours(dataset: 
&Array2, k: usize) -> CsM assert!(k > 0); let params = Params::new() - .ef_construction(1000); + .ef_construction(k); let mut searcher = Searcher::default(); let mut hnsw: HNSW> = HNSW::new_params(params); @@ -117,12 +86,10 @@ pub fn find_k_nearest_neighbours(dataset: &Array2, k: usize) -> CsM // create CSR matrix from data, indptr and indices let mat = CsMatBase::new((n_points, n_points), indptr, indices, data); let mut mat = &mat + &mat.transpose_view(); - //dbg!(mat.to_dense()); + // ensure that all values are one let val: A = NumCast::from(1.0).unwrap(); - for i in 0..(n_points) { - mat.set(i, i, val); - } + mat.map_inplace(|_| val); mat } diff --git a/linfa-reduction/src/utils.rs b/linfa-reduction/src/utils.rs index e015cd752..768f0d0f1 100644 --- a/linfa-reduction/src/utils.rs +++ b/linfa-reduction/src/utils.rs @@ -1,5 +1,6 @@ use ndarray::{Array2, Ix2, Data, Axis, ArrayBase}; use ndarray_rand::rand::Rng; +use num_traits::float::FloatConst; /// Computes a similarity matrix with gaussian kernel and scaling parameter `eps` /// @@ -43,8 +44,8 @@ pub fn generate_swissroll( //let offset: f64 = rng.gen_range(-0.5, 0.5); let offset = 0.0; - let x = phi * phi.cos() + offset; - let y = phi * phi.sin() + offset; + let x = speed * phi * phi.cos() + offset; + let y = speed * phi * phi.sin() + offset; roll[(i, 0)] = x; roll[(i, 1)] = y; @@ -52,3 +53,31 @@ pub fn generate_swissroll( } roll } + +pub fn generate_convoluted_rings( + rings: &[(f64, f64)], + n_points: usize, + rng: &mut impl Rng +) -> Array2 { + let n_points = (n_points as f32 / rings.len() as f32).ceil() as usize; + let mut array = Array2::zeros((n_points * rings.len(), 3)); + + for (n, (start, end)) in rings.into_iter().enumerate() { + // inner circle + for i in 0..n_points { + let r: f64 = rng.gen_range(start, end); + let phi: f64 = rng.gen_range(0.0, f64::PI() * 2.0); + let theta: f64 = rng.gen_range(0.0, f64::PI() * 2.0); + + let x = theta.sin() * phi.cos() * r; + let y = theta.sin() * phi.sin() * r; + let 
z = theta.cos() * r; + + array[(n * n_points + i, 0)] = x; + array[(n * n_points + i, 1)] = y; + array[(n * n_points + i, 2)] = z; + } + } + + array +} From 9414bfb130d2db686e84c9be3c4430dc2d9a4998 Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Wed, 8 Apr 2020 19:07:44 +0200 Subject: [PATCH 23/24] Remove unnecessary debugging output --- linfa-reduction/src/diffusion_map/algorithms.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/linfa-reduction/src/diffusion_map/algorithms.rs b/linfa-reduction/src/diffusion_map/algorithms.rs index 3aaa62e5c..809f71e69 100644 --- a/linfa-reduction/src/diffusion_map/algorithms.rs +++ b/linfa-reduction/src/diffusion_map/algorithms.rs @@ -97,19 +97,12 @@ fn compute_diffusion_map(kernel: impl Kernel, steps: usize, alpha: ) }; - //let d = d.mapv(|x| x.recip()); - - dbg!(&vals); let d = d.mapv(|x| x.sqrt()); for (mut col, val) in vecs.genrows_mut().into_iter().zip(d.iter()) { col *= *val; - - // scale with first component - //col /= col[0]; } - //dbg!(&vecs); let steps = NumCast::from(steps).unwrap(); for (mut vec, val) in vecs.gencolumns_mut().into_iter().zip(vals.iter()) { vec *= val.powf(steps); From 7c29902a562a7a8d21a84aae54ba104aec436abd Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Mon, 13 Jul 2020 10:58:05 +0200 Subject: [PATCH 24/24] Comment on Principal Component Analysis --- linfa-clustering/Cargo.toml | 3 +-- linfa-reduction/Cargo.toml | 2 +- .../src/diffusion_map/algorithms.rs | 2 +- linfa-reduction/src/lib.rs | 1 + linfa-reduction/src/pca/algorithms.rs | 19 +++++++++++++++++-- 5 files changed, 21 insertions(+), 6 deletions(-) diff --git a/linfa-clustering/Cargo.toml b/linfa-clustering/Cargo.toml index 701bef888..1e9bae844 100644 --- a/linfa-clustering/Cargo.toml +++ b/linfa-clustering/Cargo.toml @@ -16,8 +16,7 @@ categories = ["algorithms", "mathematics", "science"] ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]} ndarray-rand = "0.11" ndarray-stats = "0.3" -#ndarray-linalg = { 
version = "0.12", features = ["openblas"] } -ndarray-linalg = { path = "../../ndarray-linalg", features = ["openblas"] } +ndarray-linalg = { version = "0.12", features = ["openblas"] } sprs = "0.7" serde = { version = "1", features = ["derive"] } num-traits = "0.1.32" diff --git a/linfa-reduction/Cargo.toml b/linfa-reduction/Cargo.toml index 35306e51f..fcce021a9 100644 --- a/linfa-reduction/Cargo.toml +++ b/linfa-reduction/Cargo.toml @@ -16,7 +16,7 @@ categories = ["algorithms", "mathematics", "science"] ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]} ndarray-rand = "0.11" ndarray-stats = "0.3" -ndarray-linalg = { path = "../../ndarray-linalg", features = ["openblas"] } +ndarray-linalg = { version = "0.12", features = ["openblas"] } sprs = "0.7" serde = { version = "1", features = ["derive"] } num-traits = "0.1.32" diff --git a/linfa-reduction/src/diffusion_map/algorithms.rs b/linfa-reduction/src/diffusion_map/algorithms.rs index 809f71e69..333c113f3 100644 --- a/linfa-reduction/src/diffusion_map/algorithms.rs +++ b/linfa-reduction/src/diffusion_map/algorithms.rs @@ -85,7 +85,7 @@ fn compute_diffusion_map(kernel: impl Kernel, steps: usize, alpha: }, x, |_| {}, None, 1e-15, 200, TruncatedOrder::Largest); let (vals, vecs) = match result { - LobpcgResult::Ok(vals, vecs, n) | LobpcgResult::Err(vals, vecs, n, _) => { dbg!(&n); (vals, vecs)}, + LobpcgResult::Ok(vals, vecs, _) | LobpcgResult::Err(vals, vecs, _, _) => { (vals, vecs)}, _ => panic!("Eigendecomposition failed!") }; diff --git a/linfa-reduction/src/lib.rs b/linfa-reduction/src/lib.rs index 2086e3567..db5777375 100644 --- a/linfa-reduction/src/lib.rs +++ b/linfa-reduction/src/lib.rs @@ -7,6 +7,7 @@ pub mod utils; pub use diffusion_map::{DiffusionMap, DiffusionMapHyperParams}; pub use utils::to_gaussian_similarity; +pub use pca::PrincipalComponentAnalysis; use ndarray::NdFloat; use ndarray_linalg::Lapack; diff --git a/linfa-reduction/src/pca/algorithms.rs 
b/linfa-reduction/src/pca/algorithms.rs index 67da32159..c58948cb9 100644 --- a/linfa-reduction/src/pca/algorithms.rs +++ b/linfa-reduction/src/pca/algorithms.rs @@ -1,6 +1,12 @@ +///! Principal Component Analysis +/// +/// Reduce dimensionality with a linear projection using Singular Value Decomposition. The data is +/// centered before applying the SVD. This uses TruncatedSvd from ndarray-linalg package. + use ndarray::{ArrayBase, Array1, Array2, Ix2, Axis, DataMut}; use ndarray_linalg::{TruncatedSvd, TruncatedOrder}; +/// Principal Component Analysis pub struct PrincipalComponentAnalysis { embedding: Array2, explained_variance: Array1, @@ -12,16 +18,18 @@ impl PrincipalComponentAnalysis { mut dataset: ArrayBase, embedding_size: usize ) -> Self { + // calculate mean of data and subtract it let mean = dataset.mean_axis(Axis(0)).unwrap(); - dataset -= &mean; + // estimate Singular Value Decomposition let result = TruncatedSvd::new(dataset.to_owned(), TruncatedOrder::Largest) .decompose(embedding_size) .unwrap(); + // explained variance is the spectral distribution of the eigenvalues let (_, sigma, v_t) = result.values_vectors(); - let explained_variance = sigma.mapv(|x| x*x / dataset.len() as f64); + let explained_variance = sigma.mapv(|x| x*x / (dataset.len() as f64 - 1.0)); PrincipalComponentAnalysis { embedding: v_t, @@ -30,6 +38,7 @@ impl PrincipalComponentAnalysis { } } + /// Given new data points, project them with the fitted model pub fn predict>( &self, dataset: &ArrayBase @@ -37,7 +46,13 @@ impl PrincipalComponentAnalysis { (dataset - &self.mean).dot(&self.embedding.t()) } + /// Return the amount of explained variance per element pub fn explained_variance(&self) -> Array1 { self.explained_variance.clone() } + + /// Return the normalized amount of explained variance per element + pub fn explained_variance_ratio(&self) -> Array1 { + &self.explained_variance / self.explained_variance.sum() + } }