Skip to content
This repository was archived by the owner on Jul 16, 2021. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 37 additions & 20 deletions src/data/transforms/minmax.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
//! ```

use learning::error::{Error, ErrorKind};
use linalg::{Matrix, BaseMatrix, BaseMatrixMut};
use linalg::{Matrix, BaseMatrix, BaseMatrixMut, Vector};
use super::{Invertible, Transformer};

use rulinalg::utils;
Expand All @@ -42,9 +42,9 @@ use libnum::Float;
#[derive(Debug)]
pub struct MinMaxScaler<T: Float> {
/// Values to scale each column by
scale_factors: Option<Vec<T>>,
scale_factors: Option<Vector<T>>,
/// Values to add to each column after scaling
const_factors: Option<Vec<T>>,
const_factors: Option<Vector<T>>,
/// The min of the new data (default 0)
scaled_min: T,
/// The max of the new data (default 1)
Expand Down Expand Up @@ -82,9 +82,12 @@ impl<T: Float> MinMaxScaler<T> {
}

impl<T: Float> Transformer<Matrix<T>> for MinMaxScaler<T> {
fn transform(&mut self, mut inputs: Matrix<T>) -> Result<Matrix<T>, Error> {

fn fit(&mut self, inputs: &Matrix<T>) -> Result<(), Error> {
let features = inputs.cols();

// ToDo: can use min, max
// https://github.com/AtheMathmo/rulinalg/pull/115
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to note that this implementation may actually be more efficient as we handle it on one pass of the rows.

Certainly worth checking though (besides - we will have some other breaking changes by the time this PR lands).

let mut input_min_max = vec![(T::max_value(), T::min_value()); features];

for row in inputs.iter_rows() {
Expand All @@ -95,18 +98,14 @@ impl<T: Float> Transformer<Matrix<T>> for MinMaxScaler<T> {
processed",
idx)));
}

// Update min
if *feature < min_max.0 {
min_max.0 = *feature;
}

// Update max
if *feature > min_max.1 {
min_max.1 = *feature;
}


}
}

Expand All @@ -130,28 +129,46 @@ impl<T: Float> Transformer<Matrix<T>> for MinMaxScaler<T> {
.map(|(&(_, x), &s)| self.scaled_max - x * s)
.collect::<Vec<_>>();

for row in inputs.iter_rows_mut() {
utils::in_place_vec_bin_op(row, &scales, |x, &y| {
*x = *x * y;
});
self.scale_factors = Some(Vector::new(scales));
self.const_factors = Some(Vector::new(consts));
Ok(())
}

utils::in_place_vec_bin_op(row, &consts, |x, &y| {
*x = *x + y;
});
/// Scales each column of `inputs` into the configured `[scaled_min, scaled_max]` range.
///
/// If the transformer has not been fitted yet, `fit` is called on `inputs`
/// first for backward compatibility with the pre-`fit` API.
///
/// # Errors
/// - `InvalidData` if `inputs` has a different column count from the fitted data.
/// - `InvalidState` if the factors are somehow unset (unreachable in practice,
///   since the lazy `fit` above guarantees both are `Some`).
fn transform(&mut self, mut inputs: Matrix<T>) -> Result<Matrix<T>, Error> {
    if let (&None, &None) = (&self.scale_factors, &self.const_factors) {
        // if Transformer is not fitted to the data, fit for backward-compat.
        try!(self.fit(&inputs));
    }

    if let (&Some(ref scales), &Some(ref consts)) = (&self.scale_factors, &self.const_factors) {
        if scales.size() != inputs.cols() {
            Err(Error::new(ErrorKind::InvalidData,
                           "Input data has different number of columns from fitted data."))
        } else {
            // Apply `x * scale + const` in place, one row at a time.
            for row in inputs.iter_rows_mut() {
                utils::in_place_vec_bin_op(row, scales.data(), |x, &y| {
                    *x = *x * y;
                });

                utils::in_place_vec_bin_op(row, consts.data(), |x, &y| {
                    *x = *x + y;
                });
            }
            Ok(inputs)
        }
    } else {
        // can't happen: the lazy fit above always sets both factors.
        Err(Error::new(ErrorKind::InvalidState, "Transformer has not been fitted."))
    }
}
}

impl<T: Float> Invertible<Matrix<T>> for MinMaxScaler<T> {

fn inv_transform(&self, mut inputs: Matrix<T>) -> Result<Matrix<T>, Error> {
if let (&Some(ref scales), &Some(ref consts)) = (&self.scale_factors, &self.const_factors) {

let features = scales.len();
let features = scales.size();
if inputs.cols() != features {
return Err(Error::new(ErrorKind::InvalidData,
"Inputs have different feature count than transformer."));
Expand Down
4 changes: 3 additions & 1 deletion src/data/transforms/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
//! The `Transformer` trait provides a shared interface for all of the
//! data preprocessing transformations in rusty-machine.
//!
//! The transformers provide preprocessing transformations which are
//! The transformers provide preprocessing transformations which are
//! commonly used in machine learning.

pub mod minmax;
Expand All @@ -21,6 +21,8 @@ pub use self::standardize::Standardizer;

/// Trait for data transformers
pub trait Transformer<T> {
/// Fits the Transformer to the input data and stores the learned transformation in the Transformer
fn fit(&mut self, inputs: &T) -> Result<(), error::Error>;
/// Transforms the inputs and stores the transformation in the Transformer
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would add to this comment and state that if this function is used without fitting first then the Transformer will call fit itself.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A point is whether calling transform before fit is allowed as standard behavior.

I'm + 1 to remove it in future. thus better to show warning and do not describe the behavior in the doc?

fn transform(&mut self, inputs: T) -> Result<T, error::Error>;
}
Expand Down
19 changes: 19 additions & 0 deletions src/data/transforms/shuffle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
//! ```

use learning::LearningResult;
use learning::error::Error;
use linalg::{Matrix, BaseMatrix, BaseMatrixMut};
use super::Transformer;

Expand Down Expand Up @@ -74,6 +75,12 @@ impl Default for Shuffler<ThreadRng> {
///
/// Under the hood this uses a Fisher-Yates shuffle.
impl<R: Rng, T> Transformer<Matrix<T>> for Shuffler<R> {

#[allow(unused_variables)]
fn fit(&mut self, inputs: &Matrix<T>) -> Result<(), Error> {
Ok(())
}

fn transform(&mut self, mut inputs: Matrix<T>) -> LearningResult<Matrix<T>> {
let n = inputs.rows();

Expand Down Expand Up @@ -117,4 +124,16 @@ mod tests {
assert_eq!(shuffled.into_vec(),
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
}

#[test]
fn shuffle_fit() {
    let rng = StdRng::from_seed(&[1, 2, 3]);
    let mut shuffler = Shuffler::new(rng);

    // `fit` is a no-op for Shuffler; it should simply succeed.
    // Note: `assert_eq!(res, ())` on the unwrapped unit value is meaningless
    // (clippy::unit_cmp), so we assert on the Result instead.
    let mat = Matrix::new(4, 2, vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
    assert!(shuffler.fit(&mat).is_ok());
}
}
39 changes: 28 additions & 11 deletions src/data/transforms/standardize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ impl<T: Float> Standardizer<T> {
}

impl<T: Float + FromPrimitive> Transformer<Matrix<T>> for Standardizer<T> {
fn transform(&mut self, mut inputs: Matrix<T>) -> Result<Matrix<T>, Error> {

fn fit(&mut self, inputs: &Matrix<T>) -> Result<(), Error> {
if inputs.rows() <= 1 {
Err(Error::new(ErrorKind::InvalidData,
"Cannot standardize data with only one row."))
Expand All @@ -100,18 +101,34 @@ impl<T: Float + FromPrimitive> Transformer<Matrix<T>> for Standardizer<T> {
if mean.data().iter().any(|x| !x.is_finite()) {
return Err(Error::new(ErrorKind::InvalidData, "Some data point is non-finite."));
}

for row in inputs.iter_rows_mut() {
// Subtract the mean
utils::in_place_vec_bin_op(row, &mean.data(), |x, &y| *x = *x - y);
utils::in_place_vec_bin_op(row, &variance.data(), |x, &y| {
*x = (*x * self.scaled_stdev / y.sqrt()) + self.scaled_mean
});
}

self.means = Some(mean);
self.variances = Some(variance);
Ok(inputs)
Ok(())
}
}

fn transform(&mut self, mut inputs: Matrix<T>) -> Result<Matrix<T>, Error> {
if let (&None, &None) = (&self.means, &self.variances) {
// if Transformer is not fitted to the data, fit for backward-compat.
try!(self.fit(&inputs));
}

if let (&Some(ref means), &Some(ref variances)) = (&self.means, &self.variances) {
if means.size() != inputs.cols() {
Err(Error::new(ErrorKind::InvalidData,
"Input data has different number of columns from fitted data."))
} else {
for row in inputs.iter_rows_mut() {
// Subtract the mean
utils::in_place_vec_bin_op(row, means.data(), |x, &y| *x = *x - y);
utils::in_place_vec_bin_op(row, variances.data(), |x, &y| {
*x = (*x * self.scaled_stdev / y.sqrt()) + self.scaled_mean
});
}
Ok(inputs)
}
} else {
Err(Error::new(ErrorKind::InvalidState, "Transformer has not been fitted."))
}
}
}
Expand Down