From eb984feeb9202b02bcd2f7b825fa2fc2ff313a73 Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Sat, 3 Sep 2016 01:35:05 -0400
Subject: [PATCH 01/28] Implemented RMSProp

---
 src/learning/optim/grad_desc.rs   | 130 +++++++++++++++++++++++++++++-
 tests/learning/optim/grad_desc.rs |  17 +++-
 2 files changed, 145 insertions(+), 2 deletions(-)
diff --git a/src/learning/optim/grad_desc.rs b/src/learning/optim/grad_desc.rs
index 73df8fa6..26c96631 100644
--- a/src/learning/optim/grad_desc.rs
+++ b/src/learning/optim/grad_desc.rs
@@ -303,10 +303,121 @@ impl<M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>> OptimAlgorithm
     }
 }
 
+/// RMSProp 
+///
+/// The RMSProp algorithm (Hinton et al. 2012).
+#[derive(Debug, Clone, Copy)]
+pub struct RMSProp {
+    /// The base step size of gradient descent steps 
+    learning_rate: f64,
+    /// Factor of averaged square gradients to keep
+    decay_rate: f64,
+    /// Small value used to avoid divide by zero
+    epsilon: f64,
+    /// The number of passes through the data
+    iters: usize,
+}
+
+/// The default RMSProp configuration
+///
+/// The defaults are:
+///
+/// - learning_rate = 0.01
+/// - decay_rate = 0.9
+/// - epsilon = 1.0e-5
+/// - iters = 50
+impl Default for RMSProp {
+    fn default() -> RMSProp {
+        RMSProp {
+            learning_rate: 0.01,
+            decay_rate: 0.9,
+            epsilon: 1.0e-5,
+            iters: 50
+        }
+    }
+}
+
+impl RMSProp {
+    /// Construct an RMSProp algorithm.
+    ///
+    /// Requires learning rate, decay rate, epsilon, and iteration count.
+    ///
+    /// #Examples
+    ///
+    /// ```
+    /// use rusty_machine::learning::optim::grad_desc::RMSProp;
+    ///
+    /// let rms = RMSProp::new(0.99, 0.01, 1e-5, 20);
+    /// ```
+    pub fn new(learning_rate: f64, decay_rate: f64, epsilon: f64, iters: usize) -> RMSProp {
+        assert!(0f64 < learning_rate, "The learning rate must be positive");
+        assert!(0f64 < decay_rate && decay_rate < 1f64, "The decay rate must be between 0 and 1");
+        assert!(0f64 < epsilon, "Epsilon must be positive");
+
+        RMSProp {
+            decay_rate: decay_rate,
+            learning_rate: learning_rate,
+            epsilon: epsilon,
+            iters: iters
+        }
+    }
+}
+
+impl<M> OptimAlgorithm<M> for RMSProp
+    where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>> {
+    fn optimize(&self,
+                model: &M,
+                start: &[f64],
+                inputs: &M::Inputs,
+                targets: &M::Targets)
+                -> Vec<f64> {
+        // Initial parameters
+        let mut params = Vector::new(start.to_vec());
+        // Running average of squared gradients
+        let mut rmsprop_cache = Vector::zeros(start.len());
+
+        // Set up indices for permutation
+        let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
+        // The cost at the start of each iteration
+        let mut start_iter_cost = 0f64;
+
+        for _ in 0..self.iters {
+            // The cost at end of each pass
+            let mut end_cost = 0f64;
+            // Permute the vertices
+            rand_utils::in_place_fisher_yates(&mut permutation);
+            for i in &permutation {
+                let (cost, grad) = model.compute_grad(params.data(),
+                                                      &inputs.select_rows(&[*i]),
+                                                      &targets.select_rows(&[*i]));
+
+                let grad = Vector::new(grad);
+                let grad_squared = grad.clone().apply(&|x| x*x);
+                // Update cached average of squared gradients
+                rmsprop_cache = &rmsprop_cache*self.decay_rate + &grad_squared*(1.0 - self.decay_rate);
+                // RMSProp update rule 
+                params = &params - (&grad*self.learning_rate).elediv(
+                                    &rmsprop_cache.clone().apply(&|x| (x + self.epsilon).sqrt()));
+                
+                end_cost += cost;
+            }
+            end_cost /= inputs.rows() as f64;
+
+            // Early stopping
+            if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
+                break;
+            } else {
+                start_iter_cost = end_cost;
+            }
+        }
+        params.into_vec()
+    }
+}
+
 #[cfg(test)]
 mod tests {
 
-    use super::{GradientDesc, StochasticGD, AdaGrad};
+    use super::{GradientDesc, StochasticGD, AdaGrad, RMSProp};
 
     #[test]
     #[should_panic]
@@ -338,4 +449,21 @@ mod tests {
         let _ = AdaGrad::new(0.5, -1f64, 0);
     }
 
+    #[test]
+    #[should_panic]
+    fn rmsprop_neg_decay_rate() {
+        let _ = RMSProp::new(-0.5, 0.005, 1.0e-5, 0);
+    }
+
+    #[test]
+    #[should_panic]
+    fn rmsprop_neg_epsilon() {
+        let _ = RMSProp::new(0.5, 0.005, -1.0e-5, 0);
+    }
+
+    #[test]
+    #[should_panic]
+    fn rmsprop_neg_learning_rate() {
+        let _ = RMSProp::new(0.5, -0.005, 1.0e-5, 0);
+    }
 }
diff --git a/tests/learning/optim/grad_desc.rs b/tests/learning/optim/grad_desc.rs
index 97342fae..f9f74303 100644
--- a/tests/learning/optim/grad_desc.rs
+++ b/tests/learning/optim/grad_desc.rs
@@ -1,6 +1,6 @@
 use rm::learning::optim::Optimizable;
 use rm::learning::optim::fmincg::ConjugateGD;
-use rm::learning::optim::grad_desc::{GradientDesc, StochasticGD, AdaGrad};
+use rm::learning::optim::grad_desc::{GradientDesc, StochasticGD, AdaGrad, RMSProp};
 use rm::learning::optim::OptimAlgorithm;
 
 use rm::linalg::Matrix;
@@ -82,4 +82,19 @@ fn convex_adagrad_training() {
 
     assert!(params[0] - 20f64 < 1e-10);
     assert!(x_sq.compute_grad(&params, &Matrix::zeros(1, 1), &Matrix::zeros(1, 1)).0 < 1e-10);
+}
+
+#[test]
+fn convex_rmsprop_training() {
+  let x_sq = XSqModel { c: 20f64 };
+
+  let rms = RMSProp::new(0.05, 0.9, 1e-5, 50);
+  let test_data = vec![100f64];
+  let params = rms.optimize(&x_sq,
+                              &test_data[..],
+                              &Matrix::zeros(100, 1),
+                              &Matrix::zeros(100, 1));
+
+  assert!(params[0] - 20f64 < 1e-10);
+  assert!(x_sq.compute_grad(&params, &Matrix::zeros(1, 1), &Matrix::zeros(1, 1)).0 < 1e-10);
 }
\ No newline at end of file

From 5106ed6eb4c2f30602925c657f639b65a2150789 Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Sat, 3 Sep 2016 01:46:02 -0400
Subject: [PATCH 02/28] Made a few RMSProp comments more readable

---
 src/learning/optim/grad_desc.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/learning/optim/grad_desc.rs b/src/learning/optim/grad_desc.rs
index 26c96631..4ec34222 100644
--- a/src/learning/optim/grad_desc.rs
+++ b/src/learning/optim/grad_desc.rs
@@ -310,7 +310,7 @@ impl<M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>> OptimAlgorithm
 pub struct RMSProp {
     /// The base step size of gradient descent steps 
     learning_rate: f64,
-    /// Factor of averaged square gradients to keep
+    /// Rate at which running total of average square gradients decays
     decay_rate: f64,
     /// Small value used to avoid divide by zero
     epsilon: f64,
@@ -378,8 +378,8 @@ impl<M> OptimAlgorithm<M> for RMSProp
 
         // Set up indices for permutation
         let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
-        // The cost at the start of each iteration
-        let mut start_iter_cost = 0f64;
+        // The cost from the previous iteration
+        let mut prev_cost = 0f64;
 
         for _ in 0..self.iters {
             // The cost at end of each pass
@@ -397,17 +397,17 @@ impl<M> OptimAlgorithm<M> for RMSProp
                 rmsprop_cache = &rmsprop_cache*self.decay_rate + &grad_squared*(1.0 - self.decay_rate);
                 // RMSProp update rule 
                 params = &params - (&grad*self.learning_rate).elediv(
-                                    &rmsprop_cache.clone().apply(&|x| (x + self.epsilon).sqrt()));
+                                        &rmsprop_cache.clone().apply(&|x| (x + self.epsilon).sqrt()));
                 
                 end_cost += cost;
             }
             end_cost /= inputs.rows() as f64;
 
             // Early stopping
-            if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
+            if (prev_cost - end_cost).abs() < LEARNING_EPS {
                 break;
             } else {
-                start_iter_cost = end_cost;
+                prev_cost = end_cost;
             }
         }
         params.into_vec()

From 24f9d2b09a7a1a1c10762f6b26417f3bb2a118a0 Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Sat, 3 Sep 2016 14:01:34 -0400
Subject: [PATCH 03/28] Removed clone

---
 src/learning/optim/grad_desc.rs | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/learning/optim/grad_desc.rs b/src/learning/optim/grad_desc.rs
index 4ec34222..a8c50530 100644
--- a/src/learning/optim/grad_desc.rs
+++ b/src/learning/optim/grad_desc.rs
@@ -391,14 +391,16 @@ impl<M> OptimAlgorithm<M> for RMSProp
                                                       &inputs.select_rows(&[*i]),
                                                       &targets.select_rows(&[*i]));
 
-                let grad = Vector::new(grad);
+                let mut grad = Vector::new(grad);
                 let grad_squared = grad.clone().apply(&|x| x*x);
                 // Update cached average of squared gradients
                 rmsprop_cache = &rmsprop_cache*self.decay_rate + &grad_squared*(1.0 - self.decay_rate);
                 // RMSProp update rule 
-                params = &params - (&grad*self.learning_rate).elediv(
-                                        &rmsprop_cache.clone().apply(&|x| (x + self.epsilon).sqrt()));
-                
+                utils::in_place_vec_bin_op(grad.mut_data(), rmsprop_cache.data(), |x, &y| {
+                    *x = *x * self.learning_rate / (y + self.epsilon).sqrt();
+                });
+                params = &params - &grad;
+
                 end_cost += cost;
             }
             end_cost /= inputs.rows() as f64;

From 1e15d5a124d306863f6e86257ebfa58f87487b9e Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Thu, 8 Sep 2016 12:41:31 -0400
Subject: [PATCH 04/28] Created NetLayer trait

---
 src/learning/nnet.rs              |   2 +
 src/learning/toolkit/net_layer.rs | 105 ++++++++++++++++++++++++++++++
 src/lib.rs                        |   1 +
 3 files changed, 108 insertions(+)
 create mode 100644 src/learning/toolkit/net_layer.rs

diff --git a/src/learning/nnet.rs b/src/learning/nnet.rs
index 06a4a731..a55484c1 100644
--- a/src/learning/nnet.rs
+++ b/src/learning/nnet.rs
@@ -50,6 +50,8 @@ use learning::toolkit::activ_fn::ActivationFunc;
 use learning::toolkit::cost_fn;
 use learning::toolkit::cost_fn::CostFunc;
 use learning::toolkit::regularization::Regularization;
+use learning::toolkit::net_layer;
+use learning::toolkit::net_layer::NetLayer;
 use learning::optim::{Optimizable, OptimAlgorithm};
 use learning::optim::grad_desc::StochasticGD;
 
diff --git a/src/learning/toolkit/net_layer.rs b/src/learning/toolkit/net_layer.rs
new file mode 100644
index 00000000..f6a5184d
--- /dev/null
+++ b/src/learning/toolkit/net_layer.rs
@@ -0,0 +1,105 @@
+//!Neural Network Layers
+
+use linalg::{Matrix, MatrixSlice};
+use linalg::BaseSlice;
+
+use learning::toolkit::activ_fn::ActivationFunc;
+
+use rand::thread_rng;
+use rand::distributions::Sample;
+use rand::distributions::normal::Normal;
+
+/// Trait for neural net layers
+pub trait NetLayer {
+	/// The result of propogating data forward through this layer
+	fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
+
+	/// The gradient of the output of this layer with respect to its input
+	fn backward(&self, out_grad: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
+
+	/// The default value of the parameters of this layer before training
+	fn default_params(&self) -> Vec<f64>;
+
+	/// The number of parameters used by this layer
+	fn num_params(&self) -> usize;
+
+	/// The shape of the parameters used by this layer
+	fn param_shape(&self) -> (usize, usize);
+}
+
+/// Linear network layer
+///
+/// The parameters are a matrix of weights of size O x I
+/// where O is the dimensionality of the output and I the dimensionality of the input
+#[derive(Debug, Clone, Copy)]
+pub struct Linear {
+	/// The number of dimensions of the input
+	input_size: usize,
+	/// The number of dimensions of the output
+	output_size: usize,
+}
+
+impl Linear {
+	/// Construct a Linear layer
+	pub fn new(input_size: usize, output_size: usize) -> Linear {
+		Linear{input_size: input_size, output_size: output_size}
+	}
+}
+
+impl NetLayer for Linear {
+	/// Computes a matrix product
+	///
+	/// input should have dimensions N x I
+	/// where N is the number of samples and I is the dimensionality of the input
+	fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
+		assert_eq!(input.cols(), params.rows());
+		input * &params
+	}
+
+	fn backward(&self, out_grad: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
+		assert_eq!(out_grad.cols(), params.cols());
+		out_grad * params.into_matrix().transpose()
+	}
+
+	/// Initializes weights using Xavier initialization
+	///
+	/// weights draw from gaussian distribution with 0 mean and variance 2/(input_size+output_size)
+	fn default_params(&self) -> Vec<f64> {
+		let mut distro = Normal::new(0.0, (2.0/(self.input_size+self.output_size) as f64).sqrt());
+		let mut rng = thread_rng();
+
+		(0..self.input_size*self.output_size).map(|_| distro.sample(&mut rng))
+											 .collect()
+	}
+
+	fn num_params(&self) -> usize {
+		self.output_size * self.input_size
+	}
+
+	fn param_shape(&self) -> (usize, usize) {
+		(self.input_size, self.output_size)
+	}
+}
+
+impl<T: ActivationFunc> NetLayer for T {
+	/// Applys the activation function to each element of the input
+	fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+		input.clone().apply(&T::func)
+	}
+
+	fn backward(&self, out_grad: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+		out_grad.clone().apply(&T::func_grad)
+	}
+
+	fn default_params(&self) -> Vec<f64> {
+		vec![]
+	}
+
+	fn num_params(&self) -> usize {
+		0
+	}
+
+	fn param_shape(&self) -> (usize, usize) {
+		(0, 0)
+	}
+}
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index 5323c2e2..96cbfcf8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -201,6 +201,7 @@ pub mod learning {
         pub mod cost_fn;
         pub mod rand_utils;
         pub mod regularization;
+        pub mod net_layer;
     }
 }
 

From d1399998fed26807f7d0d94e6d28fa24dac47353 Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Thu, 8 Sep 2016 12:47:13 -0400
Subject: [PATCH 05/28] Added hyperbolic tangent activation function

---
 src/learning/toolkit/activ_fn.rs | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/learning/toolkit/activ_fn.rs b/src/learning/toolkit/activ_fn.rs
index 0ffe4569..8f1ef03c 100644
--- a/src/learning/toolkit/activ_fn.rs
+++ b/src/learning/toolkit/activ_fn.rs
@@ -79,3 +79,22 @@ impl ActivationFunc for Exp {
         x.ln()
     }
 }
+
+/// Hyperbolic tangent activation function
+#[derive(Clone, Copy, Debug)]
+pub struct Tanh;
+
+impl ActivationFunc for Tanh {
+    fn func(x: f64) -> f64 {
+        x.tanh()
+    }
+
+    fn func_grad(x: f64) -> f64 {
+        let y = x.tanh();
+        1.0 - y*y
+    }
+
+    fn func_inv(x: f64) -> f64 {
+        0.5*((1.0+x)/(1.0-x)).ln()
+    }
+}
\ No newline at end of file

From ff998317e8d599b5059b70a292ee3d75c2701a97 Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Thu, 8 Sep 2016 20:46:02 -0400
Subject: [PATCH 06/28] Update neural net implementation

---
 examples/nnet-and_gate.rs         |   3 +-
 src/learning/nnet.rs              | 284 +++++++++++++++++-------------
 src/learning/toolkit/activ_fn.rs  |   4 +-
 src/learning/toolkit/net_layer.rs |  66 ++++++-
 4 files changed, 221 insertions(+), 136 deletions(-)

diff --git a/examples/nnet-and_gate.rs b/examples/nnet-and_gate.rs
index bf319bf1..6d52145a 100644
--- a/examples/nnet-and_gate.rs
+++ b/examples/nnet-and_gate.rs
@@ -6,6 +6,7 @@ use std::vec::Vec;
 
 use rusty_machine::learning::nnet::{NeuralNet, BCECriterion};
 use rusty_machine::learning::toolkit::regularization::Regularization;
+use rusty_machine::learning::toolkit::activ_fn::Sigmoid;
 use rusty_machine::learning::optim::grad_desc::StochasticGD;
 
 use rusty_machine::linalg::Matrix;
@@ -41,7 +42,7 @@ fn main() {
 
     let layers = &[2, 1];
     let criterion = BCECriterion::new(Regularization::L2(0.));
-    let mut model = NeuralNet::new(layers, criterion, StochasticGD::default());
+    let mut model = NeuralNet::mlp(layers, criterion, StochasticGD::default(), Sigmoid);
 
     println!("Training...");
     model.train(&inputs, &targets);
diff --git a/src/learning/nnet.rs b/src/learning/nnet.rs
index a55484c1..4029c1de 100644
--- a/src/learning/nnet.rs
+++ b/src/learning/nnet.rs
@@ -7,6 +7,7 @@
 //! ```
 //! use rusty_machine::learning::nnet::{NeuralNet, BCECriterion};
 //! use rusty_machine::learning::toolkit::regularization::Regularization;
+//! use rusty_machine::learning::toolkit::activ_fn::Sigmoid;
 //! use rusty_machine::learning::optim::grad_desc::StochasticGD;
 //! use rusty_machine::linalg::Matrix;
 //! use rusty_machine::learning::SupModel;
@@ -23,7 +24,7 @@
 //! let criterion = BCECriterion::new(Regularization::L2(0.1));
 //!
 //! // We will just use the default stochastic gradient descent.
-//! let mut model = NeuralNet::new(layers, criterion, StochasticGD::default());
+//! let mut model = NeuralNet::mlp(layers, criterion, StochasticGD::default(), Sigmoid);
 //!
 //! // Train the model!
 //! model.train(&inputs, &targets);
@@ -58,25 +59,27 @@ use learning::optim::grad_desc::StochasticGD;
 use rand::thread_rng;
 use rand::distributions::{Sample, range};
 
+use std::fmt::Debug;
+
 /// Neural Network Model
 ///
 /// The Neural Network struct specifies a Criterion and
 /// a gradient descent algorithm.
 #[derive(Debug)]
-pub struct NeuralNet<'a, T, A>
+pub struct NeuralNet<T, A>
     where T: Criterion,
-          A: OptimAlgorithm<BaseNeuralNet<'a, T>>
+          A: OptimAlgorithm<BaseNeuralNet<T>>
 {
-    base: BaseNeuralNet<'a, T>,
+    base: BaseNeuralNet<T>,
     alg: A,
 }
 
 /// Supervised learning for the Neural Network.
 ///
 /// The model is trained using back propagation.
-impl<'a, T, A> SupModel<Matrix<f64>, Matrix<f64>> for NeuralNet<'a, T, A>
+impl<T, A> SupModel<Matrix<f64>, Matrix<f64>> for NeuralNet<T, A>
     where T: Criterion,
-          A: OptimAlgorithm<BaseNeuralNet<'a, T>>
+          A: OptimAlgorithm<BaseNeuralNet<T>>
 {
     /// Predict neural network output using forward propagation.
     fn predict(&self, inputs: &Matrix<f64>) -> Matrix<f64> {
@@ -90,7 +93,7 @@ impl<'a, T, A> SupModel<Matrix<f64>, Matrix<f64>> for NeuralNet<'a, T, A>
     }
 }
 
-impl<'a> NeuralNet<'a, BCECriterion, StochasticGD> {
+impl NeuralNet<BCECriterion, StochasticGD> {
     /// Creates a neural network with the specified layer sizes.
     ///
     /// The layer sizes slice should include the input, hidden layers, and output layer sizes.
@@ -109,17 +112,36 @@ impl<'a> NeuralNet<'a, BCECriterion, StochasticGD> {
     /// ```
     pub fn default(layer_sizes: &[usize]) -> NeuralNet<BCECriterion, StochasticGD> {
         NeuralNet {
-            base: BaseNeuralNet::default(layer_sizes),
+            base: BaseNeuralNet::default(layer_sizes, activ_fn::Sigmoid),
             alg: StochasticGD::default(),
         }
     }
 }
 
-impl<'a, T, A> NeuralNet<'a, T, A>
+impl<T, A> NeuralNet<T, A>
     where T: Criterion,
-          A: OptimAlgorithm<BaseNeuralNet<'a, T>>
+          A: OptimAlgorithm<BaseNeuralNet<T>>
 {
-    /// Create a new neural network with the specified layer sizes.
+    /// Create a new neural network with no layers
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rusty_machine::learning::nnet::BCECriterion;
+    /// use rusty_machine::learning::nnet::NeuralNet;
+    /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
+    ///
+    /// // Create a an empty neural net
+    /// let mut net = NeuralNet::new(BCECriterion::default(), StochasticGD::default());
+    /// ```
+    pub fn new(criterion: T, alg: A) -> NeuralNet<T, A> {
+        NeuralNet {
+            base: BaseNeuralNet::new(criterion),
+            alg: alg,
+        }
+    }
+
+    /// Create a multilayer perceptron with the specified layer sizes.
     ///
     /// The layer sizes slice should include the input, hidden layers, and output layer sizes.
     /// The type of activation function must be specified.
@@ -131,19 +153,44 @@ impl<'a, T, A> NeuralNet<'a, T, A>
     /// ```
     /// use rusty_machine::learning::nnet::BCECriterion;
     /// use rusty_machine::learning::nnet::NeuralNet;
+    /// use rusty_machine::learning::toolkit::activ_fn::Sigmoid;
     /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
     ///
     /// // Create a neural net with 4 layers, 3 neurons in each.
     /// let layers = &[3; 4];
-    /// let mut net = NeuralNet::new(layers, BCECriterion::default(), StochasticGD::default());
+    /// let mut net = NeuralNet::mlp(layers, BCECriterion::default(), StochasticGD::default(), Sigmoid);
     /// ```
-    pub fn new(layer_sizes: &'a [usize], criterion: T, alg: A) -> NeuralNet<'a, T, A> {
+    pub fn mlp<U>(layer_sizes: &[usize], criterion: T, alg: A, activ_fn: U) -> NeuralNet<T, A> 
+        where U: ActivationFunc + 'static {
         NeuralNet {
-            base: BaseNeuralNet::new(layer_sizes, criterion),
+            base: BaseNeuralNet::mlp(layer_sizes, criterion, activ_fn),
             alg: alg,
         }
     }
 
+    /// Adds the specified layer to the end of the network
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rusty_machine::linalg::BaseSlice;
+    /// use rusty_machine::learning::nnet::BCECriterion;
+    /// use rusty_machine::learning::nnet::NeuralNet;
+    /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
+    /// use rusty_machine::learning::toolkit::net_layer::Linear;
+    ///
+    /// // Create a new neural net 
+    /// let mut net = NeuralNet::new(BCECriterion::default(), StochasticGD::default());
+    ///
+    /// // Give net an input layer of size 3, hidden layer of size 4, and output layer of size 5
+    /// net.add_layer(Box::new(Linear::new(3, 4)));
+    /// net.add_layer(Box::new(Linear::new(4, 5)));
+    /// ```
+    pub fn add_layer<'a>(&'a mut self, layer: Box<NetLayer>) -> &'a mut NeuralNet<T, A> {
+        self.base.add_layer(layer);
+        self
+    }
+
     /// Gets matrix of weights between specified layer and forward layer.
     ///
     /// # Examples
@@ -156,7 +203,7 @@ impl<'a, T, A> NeuralNet<'a, T, A>
     /// let layers = &[3; 4];
     /// let mut net = NeuralNet::default(layers);
     ///
-    /// let w = &net.get_net_weights(2);
+    /// let w = &net.get_net_weights(1);
     ///
     /// // We add a bias term to the weight matrix
     /// assert_eq!(w.rows(), 4);
@@ -171,29 +218,49 @@ impl<'a, T, A> NeuralNet<'a, T, A>
 ///
 /// This struct cannot be instantianated and is used internally only.
 #[derive(Debug)]
-pub struct BaseNeuralNet<'a, T: Criterion> {
-    layer_sizes: &'a [usize],
+pub struct BaseNeuralNet<T: Criterion> {
+    layers: Vec<Box<NetLayer>>,
     weights: Vec<f64>,
     criterion: T,
 }
 
 
-impl<'a> BaseNeuralNet<'a, BCECriterion> {
+impl BaseNeuralNet<BCECriterion> {
     /// Creates a base neural network with the specified layer sizes.
-    fn default(layer_sizes: &[usize]) -> BaseNeuralNet<BCECriterion> {
-        BaseNeuralNet::new(layer_sizes, BCECriterion::default())
+    fn default<U>(layer_sizes: &[usize], activ_fn: U) -> BaseNeuralNet<BCECriterion>
+        where U: ActivationFunc + Debug + 'static {
+        BaseNeuralNet::mlp(layer_sizes, BCECriterion::default(), activ_fn)
     }
 }
 
 
-impl<'a, T: Criterion> BaseNeuralNet<'a, T> {
-    /// Create a new base neural network with the specified layer sizes.
-    fn new(layer_sizes: &[usize], criterion: T) -> BaseNeuralNet<T> {
+impl<T: Criterion> BaseNeuralNet<T> {
+    /// Create a base neural network with no layers
+    fn new(criterion: T) -> BaseNeuralNet<T> {
         BaseNeuralNet {
-            layer_sizes: layer_sizes,
-            weights: BaseNeuralNet::<T>::create_weights(layer_sizes),
-            criterion: criterion,
+            layers: Vec::new(),
+            weights: Vec::new(),
+            criterion: criterion
+        }
+    }
+
+    /// Create a multilayer perceptron with the specified layer sizes.
+    fn mlp<'a, U>(layer_sizes: &[usize], criterion: T, activ_fn: U) -> BaseNeuralNet<T> 
+        where U: ActivationFunc + 'static {
+        let mut mlp = BaseNeuralNet::new(criterion);
+        for shape in layer_sizes.windows(2) {
+            mlp.add_layer(Box::new(net_layer::Bias));
+            mlp.add_layer(Box::new(net_layer::Linear::new(shape[0]+1, shape[1])));
+            mlp.add_layer(Box::new(activ_fn.clone()));
         }
+        mlp
+    }
+
+    /// Adds the specified layer to the end of the network
+    fn add_layer<'a>(&'a mut self, layer: Box<NetLayer>) -> &'a mut BaseNeuralNet<T> {
+        self.weights.append(&mut layer.default_params());
+        self.layers.push(layer);
+        self
     }
 
     /// Creates initial weights for all neurons in the network.
@@ -212,31 +279,30 @@ impl<'a, T: Criterion> BaseNeuralNet<'a, T> {
             }).collect()
     }
 
-    /// Gets matrix of weights between specified layer and forward layer for the weights.
+    /// Gets matrix of weights for the specified layer for the weights.
     fn get_layer_weights(&self, weights: &[f64], idx: usize) -> MatrixSlice<f64> {
-        debug_assert!(idx < self.layer_sizes.len() - 1);
+        debug_assert!(idx < self.layers.len());
 
         // Check that the weights are the right size.
         let mut full_size = 0usize;
-        for l in 0..self.layer_sizes.len() - 1 {
-            full_size += (self.layer_sizes[l] + 1) * self.layer_sizes[l + 1];
+        for l in &self.layers {
+            full_size += l.num_params();
         }
 
         debug_assert_eq!(full_size, weights.len());
 
         let mut start = 0usize;
-
-        for l in 0..idx {
-            start += (self.layer_sizes[l] + 1) * self.layer_sizes[l + 1]
+        for l in &self.layers[..idx] {
+            start += l.num_params();
         }
 
+        let shape = self.layers[idx].param_shape();
         unsafe {
             MatrixSlice::from_raw_parts(weights.as_ptr().offset(start as isize),
-                                        self.layer_sizes[idx] + 1,
-                                        self.layer_sizes[idx + 1],
-                                        self.layer_sizes[idx + 1])
+                                        shape.0,
+                                        shape.1,
+                                        shape.1)
         }
-
     }
 
     /// Gets matrix of weights between specified layer and forward layer
@@ -257,115 +323,83 @@ impl<'a, T: Criterion> BaseNeuralNet<'a, T> {
                     inputs: &Matrix<f64>,
                     targets: &Matrix<f64>)
                     -> (f64, Vec<f64>) {
-        assert_eq!(inputs.cols(), self.layer_sizes[0]);
-
-        let mut forward_weights = Vec::with_capacity(self.layer_sizes.len() - 1);
-        let mut activations = Vec::with_capacity(self.layer_sizes.len());
-
-        let net_data = Matrix::ones(inputs.rows(), 1).hcat(inputs);
-
-        activations.push(net_data.clone());
+        let mut gradients = Vec::with_capacity(weights.len());
+        unsafe {
+            gradients.set_len(weights.len());
+        }
+        //activations[0] is input and activations[i+1] is output of layer[i]
+        let mut activations = Vec::with_capacity(self.layers.len()+1);
 
         // Forward propagation
-        {
-            let mut z = net_data * self.get_layer_weights(weights, 0);
-            forward_weights.push(z.clone());
-
-            for l in 1..self.layer_sizes.len() - 1 {
-                let mut a = self.criterion.activate(z.clone());
-                let ones = Matrix::ones(a.rows(), 1);
-
-                a = ones.hcat(&a);
-
-                z = &a * self.get_layer_weights(weights, l);
-                activations.push(a);
-                forward_weights.push(z.clone());
-            }
-
-            activations.push(self.criterion.activate(z));
+        
+        let mut index = 0;
+        activations.push(inputs.clone());
+        for layer in &self.layers {
+            let shape = layer.param_shape();
+
+            let slice = unsafe {
+                MatrixSlice::from_raw_parts(weights.as_ptr().offset(index as isize),
+                                            shape.0,
+                                            shape.1,
+                                            shape.1)
+            };
+
+            let output = layer.forward(activations.last().unwrap(), slice);
+            activations.push(output);
+            index += layer.num_params();
         }
+        let output = &activations.last().unwrap();
 
-        let mut deltas = Vec::with_capacity(self.layer_sizes.len() - 1);
         // Backward propagation
-        {
-            let z = forward_weights[self.layer_sizes.len() - 2].clone();
-            let g = self.criterion.grad_activ(z);
-
-            // Take GRAD_cost to compute this delta.
-            let mut delta = self.criterion
-                .cost_grad(&activations[self.layer_sizes.len() - 1], targets)
-                .elemul(&g);
-
-            deltas.push(delta.clone());
-
-            for l in (1..self.layer_sizes.len() - 1).rev() {
-                let mut z = forward_weights[l - 1].clone();
-                let ones = Matrix::ones(z.rows(), 1);
-                z = ones.hcat(&z);
-
-                let g = self.criterion.grad_activ(z);
-                delta = (delta * Matrix::from(self.get_layer_weights(weights, l)).transpose())
-                    .elemul(&g);
-
-                let non_one_rows = &(1..delta.cols()).collect::<Vec<usize>>()[..];
-                delta = delta.select_cols(non_one_rows);
-                deltas.push(delta.clone());
-            }
-        }
 
-        let mut gradients = Vec::with_capacity(weights.len());
+        //The gradient with respect to the current layer's output
+        let mut out_grad = self.criterion.cost_grad(output, targets);
+        // at this point index == weights.len()
+        for (i, layer) in self.layers.iter().enumerate().rev() {
+            index -= layer.num_params();
+            let shape = layer.param_shape();
 
-        for (l, activ_item) in activations.iter().take(self.layer_sizes.len() - 1).enumerate() {
-            // Compute the gradient
-            let mut g = deltas[self.layer_sizes.len() - 2 - l].transpose() * activ_item;
+            let slice = unsafe {
+                MatrixSlice::from_raw_parts(weights.as_ptr().offset(index as isize),
+                                            shape.0,
+                                            shape.1,
+                                            shape.1)
+            };
 
-            // Add the regularized gradient
-            if self.criterion.is_regularized() {
-                let layer = l;
-                let non_bias_weights = self.get_non_bias_weights(weights, layer);
-                let zeros = Matrix::zeros(1, non_bias_weights.cols());
-                g += zeros.vcat(&self.criterion.reg_cost_grad(non_bias_weights));
-            }
+            let grad_params = layer.back_params(&out_grad, &activations[i], slice);
+            out_grad = layer.back_input(&out_grad, &activations[i], slice);
 
-            gradients.append(&mut (g / inputs.rows() as f64).into_vec());
-        }
-
-        // Compute the cost
-        let mut cost = self.criterion.cost(&activations[activations.len() - 1], targets);
-
-        // Add the regularized cost
-        if self.criterion.is_regularized() {
-            for i in 0..self.layer_sizes.len() - 1 {
-                cost += self.criterion.reg_cost(self.get_non_bias_weights(weights, i));
-            }
+            gradients[index..index+layer.num_params()].copy_from_slice(&grad_params.data());
         }
 
+        let cost = self.criterion.cost(output, targets);
         (cost, gradients)
     }
 
     /// Forward propagation of the model weights to get the outputs.
     fn forward_prop(&self, inputs: &Matrix<f64>) -> Matrix<f64> {
-        assert_eq!(inputs.cols(), self.layer_sizes[0]);
-
-        let net_data = Matrix::ones(inputs.rows(), 1).hcat(inputs);
-
-        let mut z = net_data * self.get_net_weights(0);
-        let mut a = self.criterion.activate(z.clone());
-
-        for l in 1..self.layer_sizes.len() - 1 {
-            let ones = Matrix::ones(a.rows(), 1);
-            a = ones.hcat(&a);
-            z = a * self.get_net_weights(l);
-            a = self.criterion.activate(z.clone());
+        let mut index = 0;
+        let mut outputs = inputs.clone();
+        for layer in &self.layers {
+            let shape = layer.param_shape();
+
+            let slice = unsafe {
+                MatrixSlice::from_raw_parts(self.weights.as_ptr().offset(index as isize),
+                                            shape.0,
+                                            shape.1,
+                                            shape.1)
+            };
+
+            outputs = layer.forward(&outputs, slice);
+            index += layer.num_params();
         }
-
-        a
+        outputs
     }
 }
 
 /// Compute the gradient of the Neural Network using the
 /// back propagation algorithm.
-impl<'a, T: Criterion> Optimizable for BaseNeuralNet<'a, T> {
+impl<T: Criterion> Optimizable for BaseNeuralNet<T> {
     type Inputs = Matrix<f64>;
     type Targets = Matrix<f64>;
 
@@ -384,7 +418,7 @@ impl<'a, T: Criterion> Optimizable for BaseNeuralNet<'a, T> {
 /// Specifies an activation function and a cost function.
 pub trait Criterion {
     /// The activation function for the criterion.
-    type ActFunc: ActivationFunc;
+    type ActFunc: ActivationFunc + Debug;
     /// The cost function for the criterion.
     type Cost: CostFunc<Matrix<f64>>;
 
diff --git a/src/learning/toolkit/activ_fn.rs b/src/learning/toolkit/activ_fn.rs
index 8f1ef03c..e8e76fc1 100644
--- a/src/learning/toolkit/activ_fn.rs
+++ b/src/learning/toolkit/activ_fn.rs
@@ -8,8 +8,10 @@
 //! You can also create your own custom activation Functions for use in your models.
 //! Just create a unit struct implementing the `ActivationFunc` trait.
 
+use std::fmt::Debug;
+
 /// Trait for activation functions in models.
-pub trait ActivationFunc {
+pub trait ActivationFunc: Clone + Debug {
     /// The activation function.
     fn func(x: f64) -> f64;
 
diff --git a/src/learning/toolkit/net_layer.rs b/src/learning/toolkit/net_layer.rs
index f6a5184d..8a2aaa74 100644
--- a/src/learning/toolkit/net_layer.rs
+++ b/src/learning/toolkit/net_layer.rs
@@ -9,13 +9,18 @@ use rand::thread_rng;
 use rand::distributions::Sample;
 use rand::distributions::normal::Normal;
 
+use std::fmt::Debug;
+
 /// Trait for neural net layers
-pub trait NetLayer {
+pub trait NetLayer : Debug {
 	/// The result of propogating data forward through this layer
 	fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
 
 	/// The gradient of the output of this layer with respect to its input
-	fn backward(&self, out_grad: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
+	fn back_input(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
+	
+	/// The gradient of the output of this layer with respect to its parameters
+	fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
 
 	/// The default value of the parameters of this layer before training
 	fn default_params(&self) -> Vec<f64>;
@@ -56,9 +61,14 @@ impl NetLayer for Linear {
 		input * &params
 	}
 
-	fn backward(&self, out_grad: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
+	fn back_input(&self, out_grad: &Matrix<f64>, _: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
 		assert_eq!(out_grad.cols(), params.cols());
-		out_grad * params.into_matrix().transpose()
+		out_grad * &params.into_matrix().transpose()
+	}
+	
+	fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+		assert_eq!(input.rows(), out_grad.rows());
+		input.transpose() * out_grad
 	}
 
 	/// Initializes weights using Xavier initialization
@@ -73,7 +83,7 @@ impl NetLayer for Linear {
 	}
 
 	fn num_params(&self) -> usize {
-		self.output_size * self.input_size
+		self.input_size * self.output_size
 	}
 
 	fn param_shape(&self) -> (usize, usize) {
@@ -81,18 +91,56 @@ impl NetLayer for Linear {
 	}
 }
 
-impl<T: ActivationFunc> NetLayer for T {
+/// Bias layer
+///
+/// Adds a constant 1. to the end of each input
+/// Allows a linear to act like it has a bias term, for example
+#[derive(Debug, Clone, Copy)]
+pub struct Bias;
+
+impl NetLayer for Bias {
+	fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+		input.hcat(&Matrix::<f64>::ones(input.rows(), 1))
+	}
+
+	fn back_input(&self, out_grad: &Matrix<f64>, _: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+		let columns: Vec<_> = (0..out_grad.cols()-1).collect();
+		out_grad.select_cols(&columns)
+	}
+	
+	fn back_params(&self, _: &Matrix<f64>, _: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+		Matrix::new(0, 0, Vec::new())
+	}
+
+	fn default_params(&self) -> Vec<f64> {
+		Vec::new()
+	}
+
+	fn num_params(&self) -> usize {
+		0
+	}
+
+	fn param_shape(&self) -> (usize, usize) {
+		(0, 0)
+	}
+}
+
+impl<T: ActivationFunc + Debug> NetLayer for T {
 	/// Applys the activation function to each element of the input
 	fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
 		input.clone().apply(&T::func)
 	}
 
-	fn backward(&self, out_grad: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
-		out_grad.clone().apply(&T::func_grad)
+	fn back_input(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+		out_grad.elemul(&input.clone().apply(&T::func_grad))
+	}
+	
+	fn back_params(&self, _: &Matrix<f64>, _: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+		Matrix::new(0, 0, Vec::new())
 	}
 
 	fn default_params(&self) -> Vec<f64> {
-		vec![]
+		Vec::new()
 	}
 
 	fn num_params(&self) -> usize {

From 8cbe7424271798e766c959deabec89155b5906a9 Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Fri, 9 Sep 2016 16:13:32 -0400
Subject: [PATCH 07/28] Combined Bias and Linear layers

---
 src/learning/nnet.rs              | 28 +++++++----
 src/learning/toolkit/net_layer.rs | 84 +++++++++++++++----------------
 2 files changed, 60 insertions(+), 52 deletions(-)

diff --git a/src/learning/nnet.rs b/src/learning/nnet.rs
index 4029c1de..0f2ea7d6 100644
--- a/src/learning/nnet.rs
+++ b/src/learning/nnet.rs
@@ -183,8 +183,8 @@ impl<T, A> NeuralNet<T, A>
     /// let mut net = NeuralNet::new(BCECriterion::default(), StochasticGD::default());
     ///
     /// // Give net an input layer of size 3, hidden layer of size 4, and output layer of size 5
-    /// net.add_layer(Box::new(Linear::new(3, 4)));
-    /// net.add_layer(Box::new(Linear::new(4, 5)));
+    /// net.add_layer(Box::new(Linear::default(3, 4)));
+    /// net.add_layer(Box::new(Linear::default(4, 5)));
     /// ```
     pub fn add_layer<'a>(&'a mut self, layer: Box<NetLayer>) -> &'a mut NeuralNet<T, A> {
         self.base.add_layer(layer);
@@ -203,7 +203,7 @@ impl<T, A> NeuralNet<T, A>
     /// let layers = &[3; 4];
     /// let mut net = NeuralNet::default(layers);
     ///
-    /// let w = &net.get_net_weights(1);
+    /// let w = &net.get_net_weights(2);
     ///
     /// // We add a bias term to the weight matrix
     /// assert_eq!(w.rows(), 4);
@@ -249,8 +249,7 @@ impl<T: Criterion> BaseNeuralNet<T> {
         where U: ActivationFunc + 'static {
         let mut mlp = BaseNeuralNet::new(criterion);
         for shape in layer_sizes.windows(2) {
-            mlp.add_layer(Box::new(net_layer::Bias));
-            mlp.add_layer(Box::new(net_layer::Linear::new(shape[0]+1, shape[1])));
+            mlp.add_layer(Box::new(net_layer::Linear::default(shape[0], shape[1])));
             mlp.add_layer(Box::new(activ_fn.clone()));
         }
         mlp
@@ -294,7 +293,7 @@ impl<T: Criterion> BaseNeuralNet<T> {
         let mut start = 0usize;
         for l in &self.layers[..idx] {
             start += l.num_params();
-        }
+        } 
 
         let shape = self.layers[idx].param_shape();
         unsafe {
@@ -348,7 +347,7 @@ impl<T: Criterion> BaseNeuralNet<T> {
             activations.push(output);
             index += layer.num_params();
         }
-        let output = &activations.last().unwrap();
+        let output = activations.last().unwrap();
 
         // Backward propagation
 
@@ -379,8 +378,19 @@ impl<T: Criterion> BaseNeuralNet<T> {
     /// Forward propagation of the model weights to get the outputs.
     fn forward_prop(&self, inputs: &Matrix<f64>) -> Matrix<f64> {
         let mut index = 0;
-        let mut outputs = inputs.clone();
-        for layer in &self.layers {
+        if self.layers.len() == 0 {
+            return inputs.clone();
+        }
+
+        let mut outputs = unsafe {
+            let shape = self.layers[0].param_shape();
+            let slice = MatrixSlice::from_raw_parts(self.weights.as_ptr(),
+                                                    shape.0,
+                                                    shape.1,
+                                                    shape.1);
+            self.layers[0].forward(inputs, slice)
+        };
+        for layer in self.layers.iter().skip(1) {
             let shape = layer.param_shape();
 
             let slice = unsafe {
diff --git a/src/learning/toolkit/net_layer.rs b/src/learning/toolkit/net_layer.rs
index 8a2aaa74..b554c112 100644
--- a/src/learning/toolkit/net_layer.rs
+++ b/src/learning/toolkit/net_layer.rs
@@ -34,7 +34,9 @@ pub trait NetLayer : Debug {
 
 /// Linear network layer
 ///
-/// The parameters are a matrix of weights of size O x I
+/// Represents a fully connected layer with optional bias term
+///
+/// The parameters are a matrix of weights of size I x O (or (I + 1) x O when a bias term is used)
 /// where O is the dimensionality of the output and I the dimensionality of the input
 #[derive(Debug, Clone, Copy)]
 pub struct Linear {
@@ -42,12 +44,27 @@ pub struct Linear {
 	input_size: usize,
 	/// The number of dimensions of the output
 	output_size: usize,
+	/// Whether or not to include a bias term
+	has_bias: bool,
 }
 
 impl Linear {
-	/// Construct a Linear layer
-	pub fn new(input_size: usize, output_size: usize) -> Linear {
-		Linear{input_size: input_size, output_size: output_size}
+	/// Construct a new Linear layer
+	pub fn new(input_size: usize, output_size: usize, has_bias: bool) -> Linear {
+		Linear {
+			input_size: input_size + has_bias as usize, 
+			output_size: output_size,
+			has_bias: has_bias
+		}
+	}
+
+	/// Construct a Linear layer with a bias term
+	pub fn default(input_size: usize, output_size: usize) -> Linear {
+		Linear {
+			input_size: input_size + 1, 
+			output_size: output_size,
+			has_bias: true
+		}
 	}
 }
 
@@ -57,23 +74,38 @@ impl NetLayer for Linear {
 	/// input should have dimensions N x I
 	/// where N is the number of samples and I is the dimensionality of the input
 	fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
-		assert_eq!(input.cols(), params.rows());
-		input * &params
+		if self.has_bias {
+			assert_eq!(input.cols()+1, params.rows());
+			input.hcat(&Matrix::<f64>::ones(input.rows(), 1)) * &params
+		} else {
+			assert_eq!(input.cols(), params.rows());
+			input * &params
+		}
 	}
 
 	fn back_input(&self, out_grad: &Matrix<f64>, _: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
 		assert_eq!(out_grad.cols(), params.cols());
-		out_grad * &params.into_matrix().transpose()
+		let gradient = out_grad * &params.into_matrix().transpose();
+		if self.has_bias {
+			let columns: Vec<_> = (0..gradient.cols()-1).collect();
+			gradient.select_cols(&columns)
+		} else {
+			gradient
+		}
 	}
 	
 	fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
 		assert_eq!(input.rows(), out_grad.rows());
-		input.transpose() * out_grad
+		if self.has_bias {
+			input.transpose().vcat(&Matrix::<f64>::ones(1, input.rows())) * out_grad
+		} else {
+			input.transpose() * out_grad
+		}
 	}
 
 	/// Initializes weights using Xavier initialization
 	///
-	/// weights draw from gaussian distribution with 0 mean and variance 2/(input_size+output_size)
+	/// weights drawn from gaussian distribution with 0 mean and variance 2/(input_size+output_size)
 	fn default_params(&self) -> Vec<f64> {
 		let mut distro = Normal::new(0.0, (2.0/(self.input_size+self.output_size) as f64).sqrt());
 		let mut rng = thread_rng();
@@ -91,40 +123,6 @@ impl NetLayer for Linear {
 	}
 }
 
-/// Bias layer
-///
-/// Adds a constant 1. to the end of each input
-/// Allows a linear to act like it has a bias term, for example
-#[derive(Debug, Clone, Copy)]
-pub struct Bias;
-
-impl NetLayer for Bias {
-	fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
-		input.hcat(&Matrix::<f64>::ones(input.rows(), 1))
-	}
-
-	fn back_input(&self, out_grad: &Matrix<f64>, _: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
-		let columns: Vec<_> = (0..out_grad.cols()-1).collect();
-		out_grad.select_cols(&columns)
-	}
-	
-	fn back_params(&self, _: &Matrix<f64>, _: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
-		Matrix::new(0, 0, Vec::new())
-	}
-
-	fn default_params(&self) -> Vec<f64> {
-		Vec::new()
-	}
-
-	fn num_params(&self) -> usize {
-		0
-	}
-
-	fn param_shape(&self) -> (usize, usize) {
-		(0, 0)
-	}
-}
-
 impl<T: ActivationFunc + Debug> NetLayer for T {
 	/// Applys the activation function to each element of the input
 	fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {

From e19c427aa8217e54f5ac1d93a617c2835a0d8c86 Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Fri, 9 Sep 2016 16:31:52 -0400
Subject: [PATCH 08/28] Replace Linear::{new, default} with
 Linear::with{out}_bias

---
 src/learning/nnet.rs              |  6 +++---
 src/learning/toolkit/net_layer.rs | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/learning/nnet.rs b/src/learning/nnet.rs
index 0f2ea7d6..a26d034f 100644
--- a/src/learning/nnet.rs
+++ b/src/learning/nnet.rs
@@ -183,8 +183,8 @@ impl<T, A> NeuralNet<T, A>
     /// let mut net = NeuralNet::new(BCECriterion::default(), StochasticGD::default());
     ///
     /// // Give net an input layer of size 3, hidden layer of size 4, and output layer of size 5
-    /// net.add_layer(Box::new(Linear::default(3, 4)));
-    /// net.add_layer(Box::new(Linear::default(4, 5)));
+    /// net.add_layer(Box::new(Linear::with_bias(3, 4)));
+    /// net.add_layer(Box::new(Linear::with_bias(4, 5)));
     /// ```
     pub fn add_layer<'a>(&'a mut self, layer: Box<NetLayer>) -> &'a mut NeuralNet<T, A> {
         self.base.add_layer(layer);
@@ -249,7 +249,7 @@ impl<T: Criterion> BaseNeuralNet<T> {
         where U: ActivationFunc + 'static {
         let mut mlp = BaseNeuralNet::new(criterion);
         for shape in layer_sizes.windows(2) {
-            mlp.add_layer(Box::new(net_layer::Linear::default(shape[0], shape[1])));
+            mlp.add_layer(Box::new(net_layer::Linear::with_bias(shape[0], shape[1])));
             mlp.add_layer(Box::new(activ_fn.clone()));
         }
         mlp
diff --git a/src/learning/toolkit/net_layer.rs b/src/learning/toolkit/net_layer.rs
index b554c112..37957491 100644
--- a/src/learning/toolkit/net_layer.rs
+++ b/src/learning/toolkit/net_layer.rs
@@ -50,20 +50,20 @@ pub struct Linear {
 
 impl Linear {
 	/// Construct a new Linear layer
-	pub fn new(input_size: usize, output_size: usize, has_bias: bool) -> Linear {
+	pub fn with_bias(input_size: usize, output_size: usize) -> Linear {
 		Linear {
-			input_size: input_size + has_bias as usize, 
+			input_size: input_size + 1, 
 			output_size: output_size,
-			has_bias: has_bias
+			has_bias: true
 		}
 	}
 
 	/// Construct a Linear layer with a bias term
-	pub fn default(input_size: usize, output_size: usize) -> Linear {
+	pub fn without_bias(input_size: usize, output_size: usize) -> Linear {
 		Linear {
-			input_size: input_size + 1, 
+			input_size: input_size, 
 			output_size: output_size,
-			has_bias: true
+			has_bias: false
 		}
 	}
 }

From 40dcdcc954fde33feaf0f826895b9d173b116fb0 Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Wed, 21 Sep 2016 16:07:39 -0700
Subject: [PATCH 09/28] Expanded nnet module

---
 examples/nnet-and_gate.rs                   |  2 ++
 src/learning/{nnet.rs => nnet/mod.rs}       | 20 ++++++++++++--------
 src/learning/{toolkit => nnet}/net_layer.rs |  2 +-
 src/lib.rs                                  |  1 -
 4 files changed, 15 insertions(+), 10 deletions(-)
 rename src/learning/{nnet.rs => nnet/mod.rs} (97%)
 rename src/learning/{toolkit => nnet}/net_layer.rs (98%)

diff --git a/examples/nnet-and_gate.rs b/examples/nnet-and_gate.rs
index 6d52145a..d7dd90f0 100644
--- a/examples/nnet-and_gate.rs
+++ b/examples/nnet-and_gate.rs
@@ -42,6 +42,8 @@ fn main() {
 
     let layers = &[2, 1];
     let criterion = BCECriterion::new(Regularization::L2(0.));
+    // Create a multilayer perceptron with an input layer of size 2 and an output layer of size 1.
+    // It applies a Sigmoid activation function and is trained with stochastic gradient descent.
     let mut model = NeuralNet::mlp(layers, criterion, StochasticGD::default(), Sigmoid);
 
     println!("Training...");
diff --git a/src/learning/nnet.rs b/src/learning/nnet/mod.rs
similarity index 97%
rename from src/learning/nnet.rs
rename to src/learning/nnet/mod.rs
index a26d034f..a79efdfb 100644
--- a/src/learning/nnet.rs
+++ b/src/learning/nnet/mod.rs
@@ -23,7 +23,7 @@
 //! // Choose the BCE criterion with L2 regularization (`lambda=0.1`).
 //! let criterion = BCECriterion::new(Regularization::L2(0.1));
 //!
-//! // We will just use the default stochastic gradient descent.
+//! // We will create a multilayer perceptron and just use the default stochastic gradient descent.
 //! let mut model = NeuralNet::mlp(layers, criterion, StochasticGD::default(), Sigmoid);
 //!
 //! // Train the model!
@@ -42,6 +42,10 @@
 //! You can define your own criterion by implementing the `Criterion`
 //! trait with a concrete `ActivationFunc` and `CostFunc`.
 
+//! TODO: Add documentation
+
+pub mod net_layer;
+
 use linalg::{Matrix, MatrixSlice};
 use linalg::BaseSlice;
 
@@ -51,8 +55,6 @@ use learning::toolkit::activ_fn::ActivationFunc;
 use learning::toolkit::cost_fn;
 use learning::toolkit::cost_fn::CostFunc;
 use learning::toolkit::regularization::Regularization;
-use learning::toolkit::net_layer;
-use learning::toolkit::net_layer::NetLayer;
 use learning::optim::{Optimizable, OptimAlgorithm};
 use learning::optim::grad_desc::StochasticGD;
 
@@ -61,6 +63,8 @@ use rand::distributions::{Sample, range};
 
 use std::fmt::Debug;
 
+use self::net_layer::NetLayer;
+
 /// Neural Network Model
 ///
 /// The Neural Network struct specifies a Criterion and
@@ -176,15 +180,15 @@ impl<T, A> NeuralNet<T, A>
     /// use rusty_machine::linalg::BaseSlice;
     /// use rusty_machine::learning::nnet::BCECriterion;
     /// use rusty_machine::learning::nnet::NeuralNet;
+    /// use rusty_machine::learning::nnet::net_layer::Linear;
     /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
-    /// use rusty_machine::learning::toolkit::net_layer::Linear;
     ///
     /// // Create a new neural net 
     /// let mut net = NeuralNet::new(BCECriterion::default(), StochasticGD::default());
     ///
     /// // Give net an input layer of size 3, hidden layer of size 4, and output layer of size 5
-    /// net.add_layer(Box::new(Linear::with_bias(3, 4)));
-    /// net.add_layer(Box::new(Linear::with_bias(4, 5)));
+    /// net.add_layer(Box::new(Linear::new(3, 4)))
+    ///    .add_layer(Box::new(Linear::new(4, 5)));
     /// ```
     pub fn add_layer<'a>(&'a mut self, layer: Box<NetLayer>) -> &'a mut NeuralNet<T, A> {
         self.base.add_layer(layer);
@@ -242,14 +246,14 @@ impl<T: Criterion> BaseNeuralNet<T> {
             weights: Vec::new(),
             criterion: criterion
         }
-    }
+    } 
 
     /// Create a multilayer perceptron with the specified layer sizes.
     fn mlp<'a, U>(layer_sizes: &[usize], criterion: T, activ_fn: U) -> BaseNeuralNet<T> 
         where U: ActivationFunc + 'static {
         let mut mlp = BaseNeuralNet::new(criterion);
         for shape in layer_sizes.windows(2) {
-            mlp.add_layer(Box::new(net_layer::Linear::with_bias(shape[0], shape[1])));
+            mlp.add_layer(Box::new(net_layer::Linear::new(shape[0], shape[1])));
             mlp.add_layer(Box::new(activ_fn.clone()));
         }
         mlp
diff --git a/src/learning/toolkit/net_layer.rs b/src/learning/nnet/net_layer.rs
similarity index 98%
rename from src/learning/toolkit/net_layer.rs
rename to src/learning/nnet/net_layer.rs
index 37957491..a52db7c1 100644
--- a/src/learning/toolkit/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -50,7 +50,7 @@ pub struct Linear {
 
 impl Linear {
 	/// Construct a new Linear layer
-	pub fn with_bias(input_size: usize, output_size: usize) -> Linear {
+	pub fn new(input_size: usize, output_size: usize) -> Linear {
 		Linear {
 			input_size: input_size + 1, 
 			output_size: output_size,
diff --git a/src/lib.rs b/src/lib.rs
index 96cbfcf8..5323c2e2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -201,7 +201,6 @@ pub mod learning {
         pub mod cost_fn;
         pub mod rand_utils;
         pub mod regularization;
-        pub mod net_layer;
     }
 }
 

From 3588dc484a40767927655384e8983278594d388e Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Wed, 21 Sep 2016 17:42:07 -0700
Subject: [PATCH 10/28] Removed apply

---
 src/learning/nnet/mod.rs       | 31 ++++++++++++++++++++++++-------
 src/learning/nnet/net_layer.rs |  7 +++++--
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index 97e9a2dd..4e1a0fea 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -61,6 +61,7 @@ use rand::thread_rng;
 use rand::distributions::{Sample, range};
 
 use std::fmt::Debug;
+use std::iter::IntoIterator;
 
 use self::net_layer::NetLayer;
 
@@ -187,14 +188,21 @@ impl<T, A> NeuralNet<T, A>
     /// let mut net = NeuralNet::new(BCECriterion::default(), StochasticGD::default());
     ///
     /// // Give net an input layer of size 3, hidden layer of size 4, and output layer of size 5
-    /// net.add_layer(Box::new(Linear::new(3, 4)))
-    ///    .add_layer(Box::new(Linear::new(4, 5)));
+    /// net.add(Box::new(Linear::new(3, 4)))
+    ///    .add(Box::new(Linear::new(4, 5)));
     /// ```
-    pub fn add_layer<'a>(&'a mut self, layer: Box<NetLayer>) -> &'a mut NeuralNet<T, A> {
-        self.base.add_layer(layer);
+    pub fn add<'a>(&'a mut self, layer: Box<NetLayer>) -> &'a mut NeuralNet<T, A> {
+        self.base.add(layer);
         self
     }
 
+    /// Adds multiple layers to the end of the network
+    fn add_layers<'a, U>(&'a mut self, layers: U) -> &'a mut NeuralNet<T, A>
+    	where U: IntoIterator<Item = Box<NetLayer>> {
+    		self.base.add_layers(layers);
+    		self
+    }
+
     /// Gets matrix of weights between specified layer and forward layer.
     ///
     /// # Examples
@@ -253,19 +261,28 @@ impl<T: Criterion> BaseNeuralNet<T> {
         where U: ActivationFunc + 'static {
         let mut mlp = BaseNeuralNet::new(criterion);
         for shape in layer_sizes.windows(2) {
-            mlp.add_layer(Box::new(net_layer::Linear::new(shape[0], shape[1])));
-            mlp.add_layer(Box::new(activ_fn.clone()));
+            mlp.add(Box::new(net_layer::Linear::new(shape[0], shape[1])));
+            mlp.add(Box::new(activ_fn.clone()));
         }
         mlp
     }
 
     /// Adds the specified layer to the end of the network
-    fn add_layer<'a>(&'a mut self, layer: Box<NetLayer>) -> &'a mut BaseNeuralNet<T> {
+    fn add<'a>(&'a mut self, layer: Box<NetLayer>) -> &'a mut BaseNeuralNet<T> {
         self.weights.append(&mut layer.default_params());
         self.layers.push(layer);
         self
     }
 
+    /// Adds multiple layers to the end of the network
+    fn add_layers<'a, U>(&'a mut self, layers: U) -> &'a mut BaseNeuralNet<T>
+    	where U: IntoIterator<Item = Box<NetLayer>> {
+    		for layer in layers {
+    			self.add(layer);
+    		}
+    		self
+    }
+
     /// Creates initial weights for all neurons in the network.
     fn create_weights(layer_sizes: &[usize]) -> Vec<f64> {
         let mut between = range::Range::new(0f64, 1f64);
diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
index b5d71cc9..07b5a97c 100644
--- a/src/learning/nnet/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -125,11 +125,14 @@ impl NetLayer for Linear {
 impl<T: ActivationFunc + Debug> NetLayer for T {
 	/// Applys the activation function to each element of the input
 	fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
-		input.clone().apply(&T::func)
+	    Matrix::new(input.rows(), input.cols(),
+	        input.iter().map(|&x| T::func(x)).collect::<Vec<_>>())
 	}
 
 	fn back_input(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
-		out_grad.elemul(&input.clone().apply(&T::func_grad))
+		let in_grad = Matrix::new(input.rows(), input.cols(),
+	        			input.iter().map(|&x| T::func_grad(x)).collect::<Vec<_>>());
+		out_grad.elemul(&in_grad)
 	}
 	
 	fn back_params(&self, _: &Matrix<f64>, _: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {

From ac7d31a7a90179933014f71cbf4d13be1a24e49f Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Wed, 21 Sep 2016 18:05:00 -0700
Subject: [PATCH 11/28] Removed clone from compute_grad

---
 src/learning/nnet/mod.rs | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index 4e1a0fea..ad003245 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -348,13 +348,12 @@ impl<T: Criterion> BaseNeuralNet<T> {
             gradients.set_len(weights.len());
         }
         //activations[0] is input and activations[i+1] is output of layer[i]
-        let mut activations = Vec::with_capacity(self.layers.len()+1);
+        let mut activations = Vec::with_capacity(self.layers.len());
 
         // Forward propagation
         
         let mut index = 0;
-        activations.push(inputs.clone());
-        for layer in &self.layers {
+        for (i, layer) in self.layers.iter().enumerate() {
             let shape = layer.param_shape();
 
             let slice = unsafe {
@@ -364,7 +363,12 @@ impl<T: Criterion> BaseNeuralNet<T> {
                                             shape.1)
             };
 
-            let output = layer.forward(activations.last().unwrap(), slice);
+            let output = if i == 0 {
+            	layer.forward(inputs, slice)
+            } else {
+            	layer.forward(activations.last().unwrap(), slice)
+            };
+
             activations.push(output);
             index += layer.num_params();
         }
@@ -373,7 +377,7 @@ impl<T: Criterion> BaseNeuralNet<T> {
         // Backward propagation
 
         //The gradient with respect to the current layer's output
-        let mut out_grad = self.criterion.cost_grad(output, targets);
+        let mut out_grad = self.criterion.cost_grad(&output, targets);
         // at this point index == weights.len()
         for (i, layer) in self.layers.iter().enumerate().rev() {
             index -= layer.num_params();
@@ -386,13 +390,15 @@ impl<T: Criterion> BaseNeuralNet<T> {
                                             shape.1)
             };
 
-            let grad_params = layer.back_params(&out_grad, &activations[i], slice);
-            out_grad = layer.back_input(&out_grad, &activations[i], slice);
+
+            let activation = if i == 0 {inputs} else {&activations[i-1]};
+            let grad_params = layer.back_params(&out_grad, activation, slice);
+            out_grad = layer.back_input(&out_grad, activation, slice);
 
             gradients[index..index+layer.num_params()].copy_from_slice(&grad_params.data());
         }
 
-        let cost = self.criterion.cost(output, targets);
+        let cost = self.criterion.cost(&output, targets);
         (cost, gradients)
     }
 

From db35b1a81719b2e22b8676cc26fad03c2c298282 Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Wed, 21 Sep 2016 18:41:45 -0700
Subject: [PATCH 12/28] Added add_layers example

---
 src/learning/nnet/mod.rs | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index ad003245..c4247403 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -188,6 +188,7 @@ impl<T, A> NeuralNet<T, A>
     /// let mut net = NeuralNet::new(BCECriterion::default(), StochasticGD::default());
     ///
     /// // Give net an input layer of size 3, hidden layer of size 4, and output layer of size 5
+    /// // This net will not apply any activation function to the Linear layer outputs
     /// net.add(Box::new(Linear::new(3, 4)))
     ///    .add(Box::new(Linear::new(4, 5)));
     /// ```
@@ -197,7 +198,28 @@ impl<T, A> NeuralNet<T, A>
     }
 
     /// Adds multiple layers to the end of the network
-    fn add_layers<'a, U>(&'a mut self, layers: U) -> &'a mut NeuralNet<T, A>
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rusty_machine::linalg::BaseMatrix;
+    /// use rusty_machine::learning::nnet::BCECriterion;
+    /// use rusty_machine::learning::nnet::NeuralNet;
+    /// use rusty_machine::learning::nnet::net_layer::{NetLayer, Linear};
+    /// use rusty_machine::learning::toolkit::activ_fn::Sigmoid;
+    /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
+    ///
+    /// use std::clone::Clone;
+    ///
+    /// // Create a new neural net 
+    /// let mut net = NeuralNet::new(BCECriterion::default(), StochasticGD::default());
+    ///
+    /// let linear_sig: Vec<Box<NetLayer>> = vec![Box::new(Linear::new(5, 5)), Box::new(Sigmoid)];
+    ///
+    /// // Give net a layer of size 5, followed by a Sigmoid activation function
+    /// net.add_layers(linear_sig);
+    /// ```
+    pub fn add_layers<'a, U>(&'a mut self, layers: U) -> &'a mut NeuralNet<T, A>
     	where U: IntoIterator<Item = Box<NetLayer>> {
     		self.base.add_layers(layers);
     		self

From d96e99edbb859fe74e770c948b1a1e189f6b2256 Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Sat, 24 Sep 2016 12:39:46 -0700
Subject: [PATCH 13/28] improved network performance

---
 benches/examples/k_means.rs    |  4 +--
 benches/examples/nnet.rs       | 11 ++++---
 benches/examples/svm.rs        |  6 ++--
 src/learning/nnet/mod.rs       | 57 +++++++---------------------------
 src/learning/nnet/net_layer.rs | 36 ++++++++++-----------
 5 files changed, 41 insertions(+), 73 deletions(-)

diff --git a/benches/examples/k_means.rs b/benches/examples/k_means.rs
index 05dcda93..8a55698c 100644
--- a/benches/examples/k_means.rs
+++ b/benches/examples/k_means.rs
@@ -49,7 +49,7 @@ fn k_means_train(b: &mut Bencher) {
 
     b.iter(|| {
         let mut model = black_box(KMeansClassifier::new(2));
-        model.train(&samples);
+        let _ = model.train(&samples);
     });
 }
 
@@ -64,7 +64,7 @@ fn k_means_predict(b: &mut Bencher) {
     let samples = generate_data(&centroids, SAMPLES_PER_CENTROID, 0.4);
 
     let mut model = KMeansClassifier::new(2);
-    model.train(&samples);
+    let _ = model.train(&samples);
     b.iter(|| {
         let _ = black_box(model.centroids().as_ref().unwrap());
         let _ = black_box(model.predict(&samples));
diff --git a/benches/examples/nnet.rs b/benches/examples/nnet.rs
index cc567339..faef9892 100644
--- a/benches/examples/nnet.rs
+++ b/benches/examples/nnet.rs
@@ -5,6 +5,7 @@ use std::vec::Vec;
 
 use rusty_machine::learning::nnet::{NeuralNet, BCECriterion};
 use rusty_machine::learning::toolkit::regularization::Regularization;
+use rusty_machine::learning::toolkit::activ_fn::Sigmoid;
 use rusty_machine::learning::optim::grad_desc::StochasticGD;
 
 use rusty_machine::linalg::Matrix;
@@ -51,8 +52,8 @@ fn nnet_and_gate_train(b: &mut Bencher) {
     let criterion = BCECriterion::new(Regularization::L2(0.));
 
     b.iter(|| {
-        let mut model = black_box(NeuralNet::new(layers, criterion, StochasticGD::default()));
-        model.train(&inputs, &targets);
+        let mut model = black_box(NeuralNet::mlp(layers, criterion, StochasticGD::default(), Sigmoid));
+        let _ = model.train(&inputs, &targets);
     })
 }
 
@@ -62,10 +63,10 @@ fn nnet_and_gate_predict(b: &mut Bencher) {
     let layers = &[2, 1];
     let criterion = BCECriterion::new(Regularization::L2(0.));
 
-    let mut model = NeuralNet::new(layers, criterion, StochasticGD::default());
-    model.train(&inputs, &targets);
+    let mut model = NeuralNet::mlp(layers, criterion, StochasticGD::default(), Sigmoid);
+    let _ = model.train(&inputs, &targets);
 
     b.iter(|| {
-        model.predict(&test_inputs);
+        let _ = model.predict(&test_inputs);
     })
 }
diff --git a/benches/examples/svm.rs b/benches/examples/svm.rs
index 1f69eaf4..d6e6e8db 100644
--- a/benches/examples/svm.rs
+++ b/benches/examples/svm.rs
@@ -35,7 +35,7 @@ fn svm_sign_learner_train(b: &mut Bencher) {
     // Trainee
     b.iter(|| {
         let mut svm_mod = black_box(SVM::new(HyperTan::new(100., 0.), 0.3));
-        svm_mod.train(&inputs, &targets); 
+        let _ = svm_mod.train(&inputs, &targets); 
     });
 }
 
@@ -46,8 +46,8 @@ fn svm_sign_learner_predict(b: &mut Bencher) {
     let test_data = (-1000..1000).filter(|&x| x % 100 == 0).map(|x| x as f64).collect::<Vec<_>>();
     let test_inputs = Matrix::new(test_data.len(), 1, test_data);
     let mut svm_mod = SVM::new(HyperTan::new(100., 0.), 0.3);
-    svm_mod.train(&inputs, &targets);
+    let _ = svm_mod.train(&inputs, &targets);
     b.iter(|| {
-        svm_mod.predict(&test_inputs);
+        let _ = svm_mod.predict(&test_inputs);
     });
 }
\ No newline at end of file
diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index c4247403..0696a0c7 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -45,10 +45,9 @@
 
 pub mod net_layer;
 
-use linalg::{Matrix, MatrixSlice, BaseMatrix, BaseMatrixMut};
+use linalg::{Matrix, MatrixSlice, BaseMatrixMut};
 
 use learning::{LearningResult, SupModel};
-use learning::error::{Error, ErrorKind};
 use learning::toolkit::activ_fn;
 use learning::toolkit::activ_fn::ActivationFunc;
 use learning::toolkit::cost_fn;
@@ -57,9 +56,6 @@ use learning::toolkit::regularization::Regularization;
 use learning::optim::{Optimizable, OptimAlgorithm};
 use learning::optim::grad_desc::StochasticGD;
 
-use rand::thread_rng;
-use rand::distributions::{Sample, range};
-
 use std::fmt::Debug;
 use std::iter::IntoIterator;
 
@@ -305,22 +301,6 @@ impl<T: Criterion> BaseNeuralNet<T> {
     		self
     }
 
-    /// Creates initial weights for all neurons in the network.
-    fn create_weights(layer_sizes: &[usize]) -> Vec<f64> {
-        let mut between = range::Range::new(0f64, 1f64);
-        let mut rng = thread_rng();
-        layer_sizes.windows(2)
-            .flat_map(|w| {
-                let l_in = w[0] + 1;
-                let l_out = w[1];
-                let eps_init = (6f64 / (l_in + l_out) as f64).sqrt();
-                (0..l_in * l_out)
-                    .map(|_i| (between.sample(&mut rng) * 2f64 * eps_init) - eps_init)
-                    .collect::<Vec<_>>()
-            })
-            .collect()
-    }
-
     /// Gets matrix of weights for the specified layer for the weights.
     fn get_layer_weights(&self, weights: &[f64], idx: usize) -> MatrixSlice<f64> {
         debug_assert!(idx < self.layers.len());
@@ -347,18 +327,6 @@ impl<T: Criterion> BaseNeuralNet<T> {
         }
     }
 
-    /// Gets matrix of weights between specified layer and forward layer
-    /// for the base model.
-    fn get_net_weights(&self, idx: usize) -> MatrixSlice<f64> {
-        self.get_layer_weights(&self.weights[..], idx)
-    }
-
-    /// Gets the weights for a layer excluding the bias weights.
-    fn get_non_bias_weights(&self, weights: &[f64], idx: usize) -> MatrixSlice<f64> {
-        let layer_weights = self.get_layer_weights(weights, idx);
-        layer_weights.reslice([1, 0], layer_weights.rows() - 1, layer_weights.cols())
-    }
-
     /// Compute the gradient using the back propagation algorithm.
     fn compute_grad(&self,
                     weights: &[f64],
@@ -426,31 +394,30 @@ impl<T: Criterion> BaseNeuralNet<T> {
 
     /// Forward propagation of the model weights to get the outputs.
     fn forward_prop(&self, inputs: &Matrix<f64>) -> LearningResult<Matrix<f64>> {
-        let mut index = 0;
         if self.layers.len() == 0 {
             return Ok(inputs.clone());
         }
 
+        let mut ptr = self.weights.as_ptr();
         let mut outputs = unsafe {
             let shape = self.layers[0].param_shape();
-            let slice = MatrixSlice::from_raw_parts(self.weights.as_ptr(),
+            let slice = MatrixSlice::from_raw_parts(ptr,
                                                     shape.0,
                                                     shape.1,
                                                     shape.1);
+            ptr = ptr.offset(self.layers[0].num_params() as isize);
             self.layers[0].forward(inputs, slice)
         };
         for layer in self.layers.iter().skip(1) {
             let shape = layer.param_shape();
-
-            let slice = unsafe {
-                MatrixSlice::from_raw_parts(self.weights.as_ptr().offset(index as isize),
-                                            shape.0,
-                                            shape.1,
-                                            shape.1)
-            };
-
-            outputs = layer.forward(&outputs, slice);
-            index += layer.num_params();
+            unsafe {
+                let slice = MatrixSlice::from_raw_parts(ptr,
+                                                        shape.0,
+                                                        shape.1,
+                                                        shape.1);
+                outputs = layer.forward(&outputs, slice);
+                ptr = ptr.offset(layer.num_params() as isize);
+            }
         }
         Ok(outputs)
     }
diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
index 07b5a97c..5b1777f4 100644
--- a/src/learning/nnet/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -24,11 +24,14 @@ pub trait NetLayer : Debug {
 	/// The default value of the parameters of this layer before training
 	fn default_params(&self) -> Vec<f64>;
 
-	/// The number of parameters used by this layer
-	fn num_params(&self) -> usize;
-
 	/// The shape of the parameters used by this layer
 	fn param_shape(&self) -> (usize, usize);
+
+	/// The number of parameters used by this layer
+	fn num_params(&self) -> usize {
+		let shape = self.param_shape();
+		shape.0 * shape.1
+	}
 }
 
 /// Linear network layer
@@ -84,19 +87,23 @@ impl NetLayer for Linear {
 
 	fn back_input(&self, out_grad: &Matrix<f64>, _: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
 		assert_eq!(out_grad.cols(), params.cols());
-		let gradient = out_grad * &params.into_matrix().transpose();
+		//let gradient = out_grad * &params.into_matrix().transpose();
 		if self.has_bias {
-			let columns: Vec<_> = (0..gradient.cols()-1).collect();
-			gradient.select_cols(&columns)
+			//let columns: Vec<_> = (0..gradient.cols()-1).collect();
+			//gradient.select_cols(&columns)
+			let rows: Vec<_> = (0..params.cols()-1).collect();
+			out_grad * &params.into_matrix().select_rows(&rows).transpose()
 		} else {
-			gradient
+			//gradient
+			out_grad * &params.into_matrix().transpose()
 		}
 	}
 	
 	fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
 		assert_eq!(input.rows(), out_grad.rows());
 		if self.has_bias {
-			input.transpose().vcat(&Matrix::<f64>::ones(1, input.rows())) * out_grad
+			//input.transpose().vcat(&Matrix::<f64>::ones(1, input.rows())) * out_grad
+			input.hcat(&Matrix::<f64>::ones(input.rows(), 1)).transpose() * out_grad
 		} else {
 			input.transpose() * out_grad
 		}
@@ -113,10 +120,6 @@ impl NetLayer for Linear {
 											 .collect()
 	}
 
-	fn num_params(&self) -> usize {
-		self.input_size * self.output_size
-	}
-
 	fn param_shape(&self) -> (usize, usize) {
 		(self.input_size, self.output_size)
 	}
@@ -125,8 +128,9 @@ impl NetLayer for Linear {
 impl<T: ActivationFunc + Debug> NetLayer for T {
 	/// Applys the activation function to each element of the input
 	fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
-	    Matrix::new(input.rows(), input.cols(),
-	        input.iter().map(|&x| T::func(x)).collect::<Vec<_>>())
+		//Matrix::new(input.rows(), input.cols(),
+		//	input.iter().map(|&x| T::func(x)).collect::<Vec<_>>());
+		input.clone().apply(&T::func)
 	}
 
 	fn back_input(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
@@ -143,10 +147,6 @@ impl<T: ActivationFunc + Debug> NetLayer for T {
 		Vec::new()
 	}
 
-	fn num_params(&self) -> usize {
-		0
-	}
-
 	fn param_shape(&self) -> (usize, usize) {
 		(0, 0)
 	}

From 59b280b0c3dff394aadd3528e6c0f57f683f31fd Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Sun, 25 Sep 2016 16:18:04 -0700
Subject: [PATCH 14/28] Fixed doc test error

---
 src/learning/nnet/net_layer.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
index 5b1777f4..6948bb8d 100644
--- a/src/learning/nnet/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -91,7 +91,7 @@ impl NetLayer for Linear {
 		if self.has_bias {
 			//let columns: Vec<_> = (0..gradient.cols()-1).collect();
 			//gradient.select_cols(&columns)
-			let rows: Vec<_> = (0..params.cols()-1).collect();
+			let rows: Vec<_> = (0..params.rows()-1).collect();
 			out_grad * &params.into_matrix().select_rows(&rows).transpose()
 		} else {
 			//gradient

From 38bd334af19e5aef245f89c657b81c5d88ca85b4 Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Mon, 26 Sep 2016 16:23:36 -0700
Subject: [PATCH 15/28] backprop uses regularization

---
 src/learning/nnet/mod.rs       | 37 +++++++++++++++++++---------------
 src/learning/nnet/net_layer.rs |  6 +++---
 2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index 0696a0c7..e3deed91 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -46,6 +46,7 @@
 pub mod net_layer;
 
 use linalg::{Matrix, MatrixSlice, BaseMatrixMut};
+use rulinalg::utils;
 
 use learning::{LearningResult, SupModel};
 use learning::toolkit::activ_fn;
@@ -57,7 +58,6 @@ use learning::optim::{Optimizable, OptimAlgorithm};
 use learning::optim::grad_desc::StochasticGD;
 
 use std::fmt::Debug;
-use std::iter::IntoIterator;
 
 use self::net_layer::NetLayer;
 
@@ -337,8 +337,10 @@ impl<T: Criterion> BaseNeuralNet<T> {
         unsafe {
             gradients.set_len(weights.len());
         }
-        //activations[0] is input and activations[i+1] is output of layer[i]
+        // activations[i] is output of layer[i]
         let mut activations = Vec::with_capacity(self.layers.len());
+        // params[i] is the weights for layer[i]
+        let mut params = Vec::with_capacity(self.layers.len());
 
         // Forward propagation
         
@@ -360,35 +362,38 @@ impl<T: Criterion> BaseNeuralNet<T> {
             };
 
             activations.push(output);
+            params.push(slice);
             index += layer.num_params();
         }
         let output = activations.last().unwrap();
 
         // Backward propagation
 
-        //The gradient with respect to the current layer's output
+        // The gradient with respect to the current layer's output
         let mut out_grad = self.criterion.cost_grad(&output, targets);
         // at this point index == weights.len()
         for (i, layer) in self.layers.iter().enumerate().rev() {
-            index -= layer.num_params();
-            let shape = layer.param_shape();
-
-            let slice = unsafe {
-                MatrixSlice::from_raw_parts(weights.as_ptr().offset(index as isize),
-                                            shape.0,
-                                            shape.1,
-                                            shape.1)
-            };
+            let activation = if i == 0 {inputs} else {&activations[i-1]};
+            let mut grad_params = layer.back_params(&out_grad, activation, params[i]);
 
+            if self.criterion.is_regularized() {
+                utils::in_place_vec_bin_op(grad_params.mut_data(), self.criterion.reg_cost_grad(params[i]).data(), |x, &y| {
+                    *x = *x + y
+                });
+            }
 
-            let activation = if i == 0 {inputs} else {&activations[i-1]};
-            let grad_params = layer.back_params(&out_grad, activation, slice);
-            out_grad = layer.back_input(&out_grad, activation, slice);
+            out_grad = layer.back_input(&out_grad, activation, params[i]);
 
+            index -= layer.num_params();
             gradients[index..index+layer.num_params()].copy_from_slice(&grad_params.data());
         }
 
-        let cost = self.criterion.cost(&output, targets);
+        let mut cost = self.criterion.cost(&output, targets);
+        if self.criterion.is_regularized() {
+            for i in 0..self.layers.len() {
+                cost += self.criterion.reg_cost(params[i]);
+            }
+        }
         (cost, gradients)
     }
 
diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
index 6948bb8d..9979011b 100644
--- a/src/learning/nnet/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -77,16 +77,16 @@ impl NetLayer for Linear {
 	/// where N is the number of samples and I is the dimensionality of the input
 	fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
 		if self.has_bias {
-			assert_eq!(input.cols()+1, params.rows());
+			debug_assert_eq!(input.cols()+1, params.rows());
 			input.hcat(&Matrix::<f64>::ones(input.rows(), 1)) * &params
 		} else {
-			assert_eq!(input.cols(), params.rows());
+			debug_assert_eq!(input.cols(), params.rows());
 			input * &params
 		}
 	}
 
 	fn back_input(&self, out_grad: &Matrix<f64>, _: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
-		assert_eq!(out_grad.cols(), params.cols());
+		debug_assert_eq!(out_grad.cols(), params.cols());
 		//let gradient = out_grad * &params.into_matrix().transpose();
 		if self.has_bias {
 			//let columns: Vec<_> = (0..gradient.cols()-1).collect();

From 90e6a8c83b080065ba433ff03520183916da1ac8 Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Tue, 4 Oct 2016 11:16:33 -0700
Subject: [PATCH 16/28] Added error handling to neural nets

---
 src/learning/nnet/mod.rs       | 21 +++++++++++++++------
 src/learning/nnet/net_layer.rs | 28 ++++++++++++++++++----------
 2 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index e3deed91..3f4600c1 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -49,6 +49,7 @@ use linalg::{Matrix, MatrixSlice, BaseMatrixMut};
 use rulinalg::utils;
 
 use learning::{LearningResult, SupModel};
+use learning::error::{Error, ErrorKind};
 use learning::toolkit::activ_fn;
 use learning::toolkit::activ_fn::ActivationFunc;
 use learning::toolkit::cost_fn;
@@ -277,7 +278,11 @@ impl<T: Criterion> BaseNeuralNet<T> {
     /// Create a multilayer perceptron with the specified layer sizes.
     fn mlp<'a, U>(layer_sizes: &[usize], criterion: T, activ_fn: U) -> BaseNeuralNet<T> 
         where U: ActivationFunc + 'static {
-        let mut mlp = BaseNeuralNet::new(criterion);
+        let mut mlp = BaseNeuralNet {
+            layers: Vec::with_capacity(2*(layer_sizes.len()-1)),
+            weights: Vec::new(),
+            criterion: criterion
+        };
         for shape in layer_sizes.windows(2) {
             mlp.add(Box::new(net_layer::Linear::new(shape[0], shape[1])));
             mlp.add(Box::new(activ_fn.clone()));
@@ -356,9 +361,9 @@ impl<T: Criterion> BaseNeuralNet<T> {
             };
 
             let output = if i == 0 {
-            	layer.forward(inputs, slice)
+            	layer.forward(inputs, slice).unwrap()
             } else {
-            	layer.forward(activations.last().unwrap(), slice)
+            	layer.forward(activations.last().unwrap(), slice).unwrap()
             };
 
             activations.push(output);
@@ -374,8 +379,8 @@ impl<T: Criterion> BaseNeuralNet<T> {
         // at this point index == weights.len()
         for (i, layer) in self.layers.iter().enumerate().rev() {
             let activation = if i == 0 {inputs} else {&activations[i-1]};
-            let mut grad_params = layer.back_params(&out_grad, activation, params[i]);
 
+            let mut grad_params = layer.back_params(&out_grad, activation, params[i]);
             if self.criterion.is_regularized() {
                 utils::in_place_vec_bin_op(grad_params.mut_data(), self.criterion.reg_cost_grad(params[i]).data(), |x, &y| {
                     *x = *x + y
@@ -411,7 +416,7 @@ impl<T: Criterion> BaseNeuralNet<T> {
                                                     shape.1,
                                                     shape.1);
             ptr = ptr.offset(self.layers[0].num_params() as isize);
-            self.layers[0].forward(inputs, slice)
+            try!(self.layers[0].forward(inputs, slice))
         };
         for layer in self.layers.iter().skip(1) {
             let shape = layer.param_shape();
@@ -420,7 +425,11 @@ impl<T: Criterion> BaseNeuralNet<T> {
                                                         shape.0,
                                                         shape.1,
                                                         shape.1);
-                outputs = layer.forward(&outputs, slice);
+                outputs = match layer.forward(&outputs, slice) {
+                    Ok(act) => act,
+                    Err(_) => {return Err(Error::new(ErrorKind::InvalidParameters,
+                        "The network's layers do not line up correctly."))}
+                };
                 ptr = ptr.offset(layer.num_params() as isize);
             }
         }
diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
index 9979011b..61e31210 100644
--- a/src/learning/nnet/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -2,6 +2,8 @@
 
 use linalg::{Matrix, MatrixSlice, BaseMatrix, BaseMatrixMut};
 
+use learning::LearningResult;
+use learning::error::{Error, ErrorKind};
 use learning::toolkit::activ_fn::ActivationFunc;
 
 use rand::thread_rng;
@@ -13,7 +15,7 @@ use std::fmt::Debug;
 /// Trait for neural net layers
 pub trait NetLayer : Debug {
 	/// The result of propogating data forward through this layer
-	fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
+	fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> LearningResult<Matrix<f64>>;
 
 	/// The gradient of the output of this layer with respect to its input
 	fn back_input(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
@@ -75,13 +77,19 @@ impl NetLayer for Linear {
 	///
 	/// input should have dimensions N x I
 	/// where N is the number of samples and I is the dimensionality of the input
-	fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
+	fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> LearningResult<Matrix<f64>> {
 		if self.has_bias {
-			debug_assert_eq!(input.cols()+1, params.rows());
-			input.hcat(&Matrix::<f64>::ones(input.rows(), 1)) * &params
+			if input.cols()+1 != params.rows() {
+				Err(Error::new(ErrorKind::InvalidData, "The input had the wrong number of columns"))
+			} else {
+				Ok(input.hcat(&Matrix::<f64>::ones(input.rows(), 1)) * &params)
+			}
 		} else {
-			debug_assert_eq!(input.cols(), params.rows());
-			input * &params
+			if input.cols() != params.rows() {
+				Err(Error::new(ErrorKind::InvalidData, "The input had the wrong number of columns"))
+			} else {
+				Ok(input * &params)
+			}
 		}
 	}
 
@@ -100,7 +108,7 @@ impl NetLayer for Linear {
 	}
 	
 	fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
-		assert_eq!(input.rows(), out_grad.rows());
+		debug_assert_eq!(input.rows(), out_grad.rows());
 		if self.has_bias {
 			//input.transpose().vcat(&Matrix::<f64>::ones(1, input.rows())) * out_grad
 			input.hcat(&Matrix::<f64>::ones(input.rows(), 1)).transpose() * out_grad
@@ -127,10 +135,10 @@ impl NetLayer for Linear {
 
 impl<T: ActivationFunc + Debug> NetLayer for T {
 	/// Applys the activation function to each element of the input
-	fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+	fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> LearningResult<Matrix<f64>> {
 		//Matrix::new(input.rows(), input.cols(),
-		//	input.iter().map(|&x| T::func(x)).collect::<Vec<_>>());
-		input.clone().apply(&T::func)
+		//	input.iter().map(|&x| T::func(x)).collect::<Vec<_>>())
+		Ok(input.clone().apply(&T::func))
 	}
 
 	fn back_input(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {

From 9d056292d130546a6f3dcd6ee7bd6fe77eb3823f Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Fri, 7 Oct 2016 22:28:36 -0700
Subject: [PATCH 17/28] Improve neural network performance

---
 src/learning/nnet/mod.rs       | 22 +++++++++++-----------
 src/learning/nnet/net_layer.rs | 29 +++++++++++++----------------
 2 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index 3f4600c1..093a0027 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -342,7 +342,7 @@ impl<T: Criterion> BaseNeuralNet<T> {
         unsafe {
             gradients.set_len(weights.len());
         }
-        // activations[i] is output of layer[i]
+        // activations[i] is the output of layer[i]
         let mut activations = Vec::with_capacity(self.layers.len());
         // params[i] is the weights for layer[i]
         let mut params = Vec::with_capacity(self.layers.len());
@@ -370,30 +370,30 @@ impl<T: Criterion> BaseNeuralNet<T> {
             params.push(slice);
             index += layer.num_params();
         }
-        let output = activations.last().unwrap();
+        let output = &activations[activations.len()-1];
 
         // Backward propagation
-
+        
         // The gradient with respect to the current layer's output
-        let mut out_grad = self.criterion.cost_grad(&output, targets);
+        let mut out_grad = self.criterion.cost_grad(output, targets);
         // at this point index == weights.len()
         for (i, layer) in self.layers.iter().enumerate().rev() {
             let activation = if i == 0 {inputs} else {&activations[i-1]};
+            index -= layer.num_params();
 
-            let mut grad_params = layer.back_params(&out_grad, activation, params[i]);
+            let grad_params = &mut gradients[index..index+layer.num_params()];
+            grad_params.copy_from_slice(&layer.back_params(&out_grad, activation, params[i]).data());
+            
             if self.criterion.is_regularized() {
-                utils::in_place_vec_bin_op(grad_params.mut_data(), self.criterion.reg_cost_grad(params[i]).data(), |x, &y| {
+                utils::in_place_vec_bin_op(grad_params, self.criterion.reg_cost_grad(params[i]).data(), |x, &y| {
                     *x = *x + y
                 });
             }
-
+            
             out_grad = layer.back_input(&out_grad, activation, params[i]);
-
-            index -= layer.num_params();
-            gradients[index..index+layer.num_params()].copy_from_slice(&grad_params.data());
         }
 
-        let mut cost = self.criterion.cost(&output, targets);
+        let mut cost = self.criterion.cost(output, targets);
         if self.criterion.is_regularized() {
             for i in 0..self.layers.len() {
                 cost += self.criterion.reg_cost(params[i]);
diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
index 61e31210..ce37afa7 100644
--- a/src/learning/nnet/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -1,6 +1,7 @@
 //!Neural Network Layers
 
 use linalg::{Matrix, MatrixSlice, BaseMatrix, BaseMatrixMut};
+use rulinalg::utils;
 
 use learning::LearningResult;
 use learning::error::{Error, ErrorKind};
@@ -82,7 +83,7 @@ impl NetLayer for Linear {
 			if input.cols()+1 != params.rows() {
 				Err(Error::new(ErrorKind::InvalidData, "The input had the wrong number of columns"))
 			} else {
-				Ok(input.hcat(&Matrix::<f64>::ones(input.rows(), 1)) * &params)
+				Ok(&input.hcat(&Matrix::ones(input.rows(), 1)) * &params)
 			}
 		} else {
 			if input.cols() != params.rows() {
@@ -95,25 +96,21 @@ impl NetLayer for Linear {
 
 	fn back_input(&self, out_grad: &Matrix<f64>, _: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
 		debug_assert_eq!(out_grad.cols(), params.cols());
-		//let gradient = out_grad * &params.into_matrix().transpose();
+		let gradient = out_grad * &params.transpose();
 		if self.has_bias {
-			//let columns: Vec<_> = (0..gradient.cols()-1).collect();
-			//gradient.select_cols(&columns)
-			let rows: Vec<_> = (0..params.rows()-1).collect();
-			out_grad * &params.into_matrix().select_rows(&rows).transpose()
+			let columns: Vec<_> = (0..gradient.cols()-1).collect();
+			gradient.select_cols(&columns)
 		} else {
-			//gradient
-			out_grad * &params.into_matrix().transpose()
+			gradient
 		}
 	}
 	
 	fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
 		debug_assert_eq!(input.rows(), out_grad.rows());
 		if self.has_bias {
-			//input.transpose().vcat(&Matrix::<f64>::ones(1, input.rows())) * out_grad
-			input.hcat(&Matrix::<f64>::ones(input.rows(), 1)).transpose() * out_grad
+			&input.hcat(&Matrix::ones(input.rows(), 1)).transpose() * out_grad
 		} else {
-			input.transpose() * out_grad
+			&input.transpose() * out_grad
 		}
 	}
 
@@ -136,15 +133,15 @@ impl NetLayer for Linear {
 impl<T: ActivationFunc + Debug> NetLayer for T {
 	/// Applys the activation function to each element of the input
 	fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> LearningResult<Matrix<f64>> {
-		//Matrix::new(input.rows(), input.cols(),
-		//	input.iter().map(|&x| T::func(x)).collect::<Vec<_>>())
 		Ok(input.clone().apply(&T::func))
 	}
 
 	fn back_input(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
-		let in_grad = Matrix::new(input.rows(), input.cols(),
-	        			input.iter().map(|&x| T::func_grad(x)).collect::<Vec<_>>());
-		out_grad.elemul(&in_grad)
+		let mut in_grad = input.clone();
+		utils::in_place_vec_bin_op(in_grad.mut_data(), out_grad.data(), |x, &y| {
+            *x = T::func_grad(*x) * y
+        });
+		in_grad
 	}
 	
 	fn back_params(&self, _: &Matrix<f64>, _: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {

From d555bef66ed85af0ba7f4fd52464f4bc634838ac Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Fri, 7 Oct 2016 22:30:44 -0700
Subject: [PATCH 18/28] removed ActFunc from Criterion

---
 src/learning/nnet/mod.rs | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index 093a0027..4cc42afa 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -45,7 +45,7 @@
 
 pub mod net_layer;
 
-use linalg::{Matrix, MatrixSlice, BaseMatrixMut};
+use linalg::{Matrix, MatrixSlice};
 use rulinalg::utils;
 
 use learning::{LearningResult, SupModel};
@@ -383,7 +383,7 @@ impl<T: Criterion> BaseNeuralNet<T> {
 
             let grad_params = &mut gradients[index..index+layer.num_params()];
             grad_params.copy_from_slice(&layer.back_params(&out_grad, activation, params[i]).data());
-            
+
             if self.criterion.is_regularized() {
                 utils::in_place_vec_bin_op(grad_params, self.criterion.reg_cost_grad(params[i]).data(), |x, &y| {
                     *x = *x + y
@@ -457,21 +457,9 @@ impl<T: Criterion> Optimizable for BaseNeuralNet<T> {
 ///
 /// Specifies an activation function and a cost function.
 pub trait Criterion {
-    /// The activation function for the criterion.
-    type ActFunc: ActivationFunc + Debug;
     /// The cost function for the criterion.
     type Cost: CostFunc<Matrix<f64>>;
 
-    /// The activation function applied to a matrix.
-    fn activate(&self, mat: Matrix<f64>) -> Matrix<f64> {
-        mat.apply(&Self::ActFunc::func)
-    }
-
-    /// The gradient of the activation function applied to a matrix.
-    fn grad_activ(&self, mat: Matrix<f64>) -> Matrix<f64> {
-        mat.apply(&Self::ActFunc::func_grad)
-    }
-
     /// The cost function.
     ///
     /// Returns a scalar cost.
@@ -534,7 +522,6 @@ pub struct BCECriterion {
 }
 
 impl Criterion for BCECriterion {
-    type ActFunc = activ_fn::Sigmoid;
     type Cost = cost_fn::CrossEntropyError;
 
     fn regularization(&self) -> Regularization<f64> {
@@ -576,7 +563,6 @@ pub struct MSECriterion {
 }
 
 impl Criterion for MSECriterion {
-    type ActFunc = activ_fn::Linear;
     type Cost = cost_fn::MeanSqError;
 
     fn regularization(&self) -> Regularization<f64> {

From 999524e5e88d1a8ba6164ed554bf6ca0cbd4b2f7 Mon Sep 17 00:00:00 2001
From: Johann Tuffe <tafia973@gmail.com>
Date: Sat, 8 Oct 2016 16:57:21 +0800
Subject: [PATCH 19/28] Replace select_cols with simpler sub_slice.into plus
 some reformatting

---
 src/learning/nnet/mod.rs       |  35 ++---
 src/learning/nnet/net_layer.rs | 243 +++++++++++++++++----------------
 2 files changed, 137 insertions(+), 141 deletions(-)

diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index 4cc42afa..13dd2cca 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -217,9 +217,9 @@ impl<T, A> NeuralNet<T, A>
     /// net.add_layers(linear_sig);
     /// ```
     pub fn add_layers<'a, U>(&'a mut self, layers: U) -> &'a mut NeuralNet<T, A>
-    	where U: IntoIterator<Item = Box<NetLayer>> {
-    		self.base.add_layers(layers);
-    		self
+        where U: IntoIterator<Item = Box<NetLayer>> {
+            self.base.add_layers(layers);
+            self
     }
 
     /// Gets matrix of weights between specified layer and forward layer.
@@ -247,7 +247,7 @@ impl<T, A> NeuralNet<T, A>
 
 /// Base Neural Network struct
 ///
-/// This struct cannot be instantianated and is used internally only.
+/// This struct cannot be instantiated and is used internally only.
 #[derive(Debug)]
 pub struct BaseNeuralNet<T: Criterion> {
     layers: Vec<Box<NetLayer>>,
@@ -299,11 +299,12 @@ impl<T: Criterion> BaseNeuralNet<T> {
 
     /// Adds multiple layers to the end of the network
     fn add_layers<'a, U>(&'a mut self, layers: U) -> &'a mut BaseNeuralNet<T>
-    	where U: IntoIterator<Item = Box<NetLayer>> {
-    		for layer in layers {
-    			self.add(layer);
-    		}
-    		self
+        where U: IntoIterator<Item = Box<NetLayer>> 
+    {
+        for layer in layers {
+            self.add(layer);
+        }
+        self
     }
 
     /// Gets matrix of weights for the specified layer for the weights.
@@ -311,17 +312,11 @@ impl<T: Criterion> BaseNeuralNet<T> {
         debug_assert!(idx < self.layers.len());
 
         // Check that the weights are the right size.
-        let mut full_size = 0usize;
-        for l in &self.layers {
-            full_size += l.num_params();
-        }
+        let full_size: usize = self.layers.iter().map(|l| l.num_params()).sum();
 
         debug_assert_eq!(full_size, weights.len());
 
-        let mut start = 0usize;
-        for l in &self.layers[..idx] {
-            start += l.num_params();
-        } 
+        let start: usize = self.layers.iter().take(idx).map(|l| l.num_params()).sum();
 
         let shape = self.layers[idx].param_shape();
         unsafe {
@@ -361,9 +356,9 @@ impl<T: Criterion> BaseNeuralNet<T> {
             };
 
             let output = if i == 0 {
-            	layer.forward(inputs, slice).unwrap()
+                layer.forward(inputs, slice).unwrap()
             } else {
-            	layer.forward(activations.last().unwrap(), slice).unwrap()
+                layer.forward(activations.last().unwrap(), slice).unwrap()
             };
 
             activations.push(output);
@@ -404,7 +399,7 @@ impl<T: Criterion> BaseNeuralNet<T> {
 
     /// Forward propagation of the model weights to get the outputs.
     fn forward_prop(&self, inputs: &Matrix<f64>) -> LearningResult<Matrix<f64>> {
-        if self.layers.len() == 0 {
+        if self.layers.is_empty() {
             return Ok(inputs.clone());
         }
 
diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
index ce37afa7..cbf6e2be 100644
--- a/src/learning/nnet/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -15,26 +15,26 @@ use std::fmt::Debug;
 
 /// Trait for neural net layers
 pub trait NetLayer : Debug {
-	/// The result of propogating data forward through this layer
-	fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> LearningResult<Matrix<f64>>;
-
-	/// The gradient of the output of this layer with respect to its input
-	fn back_input(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
-	
-	/// The gradient of the output of this layer with respect to its parameters
-	fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
-
-	/// The default value of the parameters of this layer before training
-	fn default_params(&self) -> Vec<f64>;
-
-	/// The shape of the parameters used by this layer
-	fn param_shape(&self) -> (usize, usize);
-
-	/// The number of parameters used by this layer
-	fn num_params(&self) -> usize {
-		let shape = self.param_shape();
-		shape.0 * shape.1
-	}
+    /// The result of propagating data forward through this layer
+    fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> LearningResult<Matrix<f64>>;
+
+    /// The gradient of the output of this layer with respect to its input
+    fn back_input(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
+    
+    /// The gradient of the output of this layer with respect to its parameters
+    fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
+
+    /// The default value of the parameters of this layer before training
+    fn default_params(&self) -> Vec<f64>;
+
+    /// The shape of the parameters used by this layer
+    fn param_shape(&self) -> (usize, usize);
+
+    /// The number of parameters used by this layer
+    fn num_params(&self) -> usize {
+        let shape = self.param_shape();
+        shape.0 * shape.1
+    }
 }
 
 /// Linear network layer
@@ -44,115 +44,116 @@ pub trait NetLayer : Debug {
 /// The parameters are a matrix of weights of size I x O
 /// where O is the dimensionality of the output and I the dimensionality of the input
 #[derive(Debug, Clone, Copy)]
-pub struct Linear {
-	/// The number of dimensions of the input
-	input_size: usize,
-	/// The number of dimensions of the output
-	output_size: usize,
-	/// Whether or not to include a bias term
-	has_bias: bool,
+pub struct Linear { 
+    /// The number of dimensions of the input
+    input_size: usize,
+    /// The number of dimensions of the output
+    output_size: usize,
+    /// Whether or not to include a bias term
+    has_bias: bool,
 }
 
 impl Linear {
-	/// Construct a new Linear layer
-	pub fn new(input_size: usize, output_size: usize) -> Linear {
-		Linear {
-			input_size: input_size + 1, 
-			output_size: output_size,
-			has_bias: true
-		}
-	}
-
-	/// Construct a Linear layer with a bias term
-	pub fn without_bias(input_size: usize, output_size: usize) -> Linear {
-		Linear {
-			input_size: input_size, 
-			output_size: output_size,
-			has_bias: false
-		}
-	}
+    /// Construct a new Linear layer
+    pub fn new(input_size: usize, output_size: usize) -> Linear {
+        Linear {
+            input_size: input_size + 1, 
+            output_size: output_size,
+            has_bias: true
+        }
+    }
+
+    /// Construct a Linear layer with a bias term
+    pub fn without_bias(input_size: usize, output_size: usize) -> Linear {
+        Linear {
+            input_size: input_size, 
+            output_size: output_size,
+            has_bias: false
+        }
+    }
 }
 
 impl NetLayer for Linear {
-	/// Computes a matrix product
-	///
-	/// input should have dimensions N x I
-	/// where N is the number of samples and I is the dimensionality of the input
-	fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> LearningResult<Matrix<f64>> {
-		if self.has_bias {
-			if input.cols()+1 != params.rows() {
-				Err(Error::new(ErrorKind::InvalidData, "The input had the wrong number of columns"))
-			} else {
-				Ok(&input.hcat(&Matrix::ones(input.rows(), 1)) * &params)
-			}
-		} else {
-			if input.cols() != params.rows() {
-				Err(Error::new(ErrorKind::InvalidData, "The input had the wrong number of columns"))
-			} else {
-				Ok(input * &params)
-			}
-		}
-	}
-
-	fn back_input(&self, out_grad: &Matrix<f64>, _: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
-		debug_assert_eq!(out_grad.cols(), params.cols());
-		let gradient = out_grad * &params.transpose();
-		if self.has_bias {
-			let columns: Vec<_> = (0..gradient.cols()-1).collect();
-			gradient.select_cols(&columns)
-		} else {
-			gradient
-		}
-	}
-	
-	fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
-		debug_assert_eq!(input.rows(), out_grad.rows());
-		if self.has_bias {
-			&input.hcat(&Matrix::ones(input.rows(), 1)).transpose() * out_grad
-		} else {
-			&input.transpose() * out_grad
-		}
-	}
-
-	/// Initializes weights using Xavier initialization
-	///
-	/// weights drawn from gaussian distribution with 0 mean and variance 2/(input_size+output_size)
-	fn default_params(&self) -> Vec<f64> {
-		let mut distro = Normal::new(0.0, (2.0/(self.input_size+self.output_size) as f64).sqrt());
-		let mut rng = thread_rng();
-
-		(0..self.input_size*self.output_size).map(|_| distro.sample(&mut rng))
-											 .collect()
-	}
-
-	fn param_shape(&self) -> (usize, usize) {
-		(self.input_size, self.output_size)
-	}
+    /// Computes a matrix product
+    ///
+    /// input should have dimensions N x I
+    /// where N is the number of samples and I is the dimensionality of the input
+    fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> LearningResult<Matrix<f64>> {
+        if self.has_bias {
+            if input.cols()+1 != params.rows() {
+                Err(Error::new(ErrorKind::InvalidData, "The input had the wrong number of columns"))
+            } else {
+                Ok(&input.hcat(&Matrix::ones(input.rows(), 1)) * &params)
+            }
+        } else {
+            if input.cols() != params.rows() {
+                Err(Error::new(ErrorKind::InvalidData, "The input had the wrong number of columns"))
+            } else {
+                Ok(input * &params)
+            }
+        }
+    }
+
+    fn back_input(&self, out_grad: &Matrix<f64>, _: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
+        debug_assert_eq!(out_grad.cols(), params.cols());
+        let gradient = out_grad * &params.transpose();
+        if self.has_bias {
+            let rows = gradient.rows();
+            let cols = gradient.cols() - 1;
+            gradient.sub_slice([0, 0], rows, cols).into() 
+        } else {
+            gradient
+        }
+    }
+    
+    fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+        debug_assert_eq!(input.rows(), out_grad.rows());
+        if self.has_bias {
+            &input.hcat(&Matrix::ones(input.rows(), 1)).transpose() * out_grad
+        } else {
+            &input.transpose() * out_grad
+        }
+    }
+
+    /// Initializes weights using Xavier initialization
+    ///
+    /// weights drawn from gaussian distribution with 0 mean and variance 2/(input_size+output_size)
+    fn default_params(&self) -> Vec<f64> {
+        let mut distro = Normal::new(0.0, (2.0/(self.input_size+self.output_size) as f64).sqrt());
+        let mut rng = thread_rng();
+
+        (0..self.input_size*self.output_size).map(|_| distro.sample(&mut rng))
+                                             .collect()
+    }
+
+    fn param_shape(&self) -> (usize, usize) {
+        (self.input_size, self.output_size)
+    }
 }
 
 impl<T: ActivationFunc + Debug> NetLayer for T {
-	/// Applys the activation function to each element of the input
-	fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> LearningResult<Matrix<f64>> {
-		Ok(input.clone().apply(&T::func))
-	}
-
-	fn back_input(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
-		let mut in_grad = input.clone();
-		utils::in_place_vec_bin_op(in_grad.mut_data(), out_grad.data(), |x, &y| {
+    /// Applys the activation function to each element of the input
+    fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> LearningResult<Matrix<f64>> {
+        Ok(input.clone().apply(&T::func))
+    }
+
+    fn back_input(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+        let mut in_grad = input.clone();
+        utils::in_place_vec_bin_op(in_grad.mut_data(), out_grad.data(), |x, &y| {
             *x = T::func_grad(*x) * y
         });
-		in_grad
-	}
-	
-	fn back_params(&self, _: &Matrix<f64>, _: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
-		Matrix::new(0, 0, Vec::new())
-	}
-
-	fn default_params(&self) -> Vec<f64> {
-		Vec::new()
-	}
-
-	fn param_shape(&self) -> (usize, usize) {
-		(0, 0)
-	}
-}
\ No newline at end of file
+        in_grad
+    }
+    
+    fn back_params(&self, _: &Matrix<f64>, _: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+        Matrix::new(0, 0, Vec::new())
+    }
+
+    fn default_params(&self) -> Vec<f64> {
+        Vec::new()
+    }
+
+    fn param_shape(&self) -> (usize, usize) {
+        (0, 0)
+    }
+}

From af179955331483efb053c0faf991c47046f2ca1e Mon Sep 17 00:00:00 2001
From: Johann Tuffe <tafia973@gmail.com>
Date: Mon, 10 Oct 2016 00:48:00 +0800
Subject: [PATCH 20/28] extend_from_slice instead of append

---
 src/learning/nnet/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index 13dd2cca..a9ff43a8 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -292,7 +292,7 @@ impl<T: Criterion> BaseNeuralNet<T> {
 
     /// Adds the specified layer to the end of the network
     fn add<'a>(&'a mut self, layer: Box<NetLayer>) -> &'a mut BaseNeuralNet<T> {
-        self.weights.append(&mut layer.default_params());
+        self.weights.extend_from_slice(&layer.default_params());
         self.layers.push(layer);
         self
     }

From 4c7ed37c3ad83b00ab2b52de29ccb8c037c71a6b Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Fri, 14 Oct 2016 10:50:57 -0700
Subject: [PATCH 21/28] bias appended to beginning of inputs

---
 src/learning/nnet/net_layer.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
index cbf6e2be..3bc70001 100644
--- a/src/learning/nnet/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -83,7 +83,7 @@ impl NetLayer for Linear {
             if input.cols()+1 != params.rows() {
                 Err(Error::new(ErrorKind::InvalidData, "The input had the wrong number of columns"))
             } else {
-                Ok(&input.hcat(&Matrix::ones(input.rows(), 1)) * &params)
+                Ok(&Matrix::ones(input.rows(), 1).hcat(input) * &params)
             }
         } else {
             if input.cols() != params.rows() {
@@ -100,7 +100,7 @@ impl NetLayer for Linear {
         if self.has_bias {
             let rows = gradient.rows();
             let cols = gradient.cols() - 1;
-            gradient.sub_slice([0, 0], rows, cols).into() 
+            gradient.sub_slice([0, 1], rows, cols).into() 
         } else {
             gradient
         }
@@ -109,7 +109,7 @@ impl NetLayer for Linear {
     fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
         debug_assert_eq!(input.rows(), out_grad.rows());
         if self.has_bias {
-            &input.hcat(&Matrix::ones(input.rows(), 1)).transpose() * out_grad
+            &Matrix::ones(input.rows(), 1).hcat(input).transpose() * out_grad
         } else {
             &input.transpose() * out_grad
         }

From fcca7b72c3898689d3eb20fe90b7ca947796d65f Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Wed, 26 Oct 2016 02:06:25 -0700
Subject: [PATCH 22/28] Compute regularization cost/grad in one step

---
 src/learning/nnet/mod.rs       | 16 +++++++---------
 src/learning/nnet/net_layer.rs |  2 +-
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index a9ff43a8..1161bd71 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -378,21 +378,19 @@ impl<T: Criterion> BaseNeuralNet<T> {
 
             let grad_params = &mut gradients[index..index+layer.num_params()];
             grad_params.copy_from_slice(&layer.back_params(&out_grad, activation, params[i]).data());
-
-            if self.criterion.is_regularized() {
-                utils::in_place_vec_bin_op(grad_params, self.criterion.reg_cost_grad(params[i]).data(), |x, &y| {
-                    *x = *x + y
-                });
-            }
             
             out_grad = layer.back_input(&out_grad, activation, params[i]);
         }
 
         let mut cost = self.criterion.cost(output, targets);
         if self.criterion.is_regularized() {
-            for i in 0..self.layers.len() {
-                cost += self.criterion.reg_cost(params[i]);
-            }
+            let all_params = unsafe {
+                MatrixSlice::from_raw_parts(weights.as_ptr(), weights.len(), 1, 1)
+            };
+            utils::in_place_vec_bin_op(&mut gradients,
+                                       self.criterion.reg_cost_grad(all_params).data(),
+                                       |x, &y| *x = *x + y);
+            cost += self.criterion.reg_cost(all_params);
         }
         (cost, gradients)
     }
diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
index 3bc70001..b9229c86 100644
--- a/src/learning/nnet/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -132,7 +132,7 @@ impl NetLayer for Linear {
 }
 
 impl<T: ActivationFunc + Debug> NetLayer for T {
-    /// Applys the activation function to each element of the input
+    /// Applies the activation function to each element of the input
     fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> LearningResult<Matrix<f64>> {
         Ok(input.clone().apply(&T::func))
     }

From 66f53feb138388eeef7233827ccdbe5548e68dce Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Tue, 8 Nov 2016 00:00:26 -0800
Subject: [PATCH 23/28] Removed useless 'a and &

---
 src/learning/nnet/mod.rs       | 6 +++---
 src/learning/nnet/net_layer.rs | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index 1161bd71..269269e9 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -276,7 +276,7 @@ impl<T: Criterion> BaseNeuralNet<T> {
     } 
 
     /// Create a multilayer perceptron with the specified layer sizes.
-    fn mlp<'a, U>(layer_sizes: &[usize], criterion: T, activ_fn: U) -> BaseNeuralNet<T> 
+    fn mlp<U>(layer_sizes: &[usize], criterion: T, activ_fn: U) -> BaseNeuralNet<T> 
         where U: ActivationFunc + 'static {
         let mut mlp = BaseNeuralNet {
             layers: Vec::with_capacity(2*(layer_sizes.len()-1)),
@@ -365,7 +365,7 @@ impl<T: Criterion> BaseNeuralNet<T> {
             params.push(slice);
             index += layer.num_params();
         }
-        let output = &activations[activations.len()-1];
+        let output = activations.last().unwrap();
 
         // Backward propagation
         
@@ -377,7 +377,7 @@ impl<T: Criterion> BaseNeuralNet<T> {
             index -= layer.num_params();
 
             let grad_params = &mut gradients[index..index+layer.num_params()];
-            grad_params.copy_from_slice(&layer.back_params(&out_grad, activation, params[i]).data());
+            grad_params.copy_from_slice(layer.back_params(&out_grad, activation, params[i]).data());
             
             out_grad = layer.back_input(&out_grad, activation, params[i]);
         }
diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
index b9229c86..c963f036 100644
--- a/src/learning/nnet/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -93,7 +93,7 @@ impl NetLayer for Linear {
             }
         }
     }
-
+    
     fn back_input(&self, out_grad: &Matrix<f64>, _: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
         debug_assert_eq!(out_grad.cols(), params.cols());
         let gradient = out_grad * &params.transpose();

From ccef5d38079f9dfbedb47d99bfca0613828c9027 Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Fri, 6 Jan 2017 00:43:10 -0800
Subject: [PATCH 24/28] removed useless Debug's

---
 src/learning/nnet/mod.rs       | 4 +---
 src/learning/nnet/net_layer.rs | 4 ++--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index 269269e9..22311a8d 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -58,8 +58,6 @@ use learning::toolkit::regularization::Regularization;
 use learning::optim::{Optimizable, OptimAlgorithm};
 use learning::optim::grad_desc::StochasticGD;
 
-use std::fmt::Debug;
-
 use self::net_layer::NetLayer;
 
 /// Neural Network Model
@@ -259,7 +257,7 @@ pub struct BaseNeuralNet<T: Criterion> {
 impl BaseNeuralNet<BCECriterion> {
     /// Creates a base neural network with the specified layer sizes.
     fn default<U>(layer_sizes: &[usize], activ_fn: U) -> BaseNeuralNet<BCECriterion>
-        where U: ActivationFunc + Debug + 'static {
+        where U: ActivationFunc + 'static {
         BaseNeuralNet::mlp(layer_sizes, BCECriterion::default(), activ_fn)
     }
 }
diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
index c963f036..2ecfb276 100644
--- a/src/learning/nnet/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -1,4 +1,4 @@
-//!Neural Network Layers
+//! Neural Network Layers
 
 use linalg::{Matrix, MatrixSlice, BaseMatrix, BaseMatrixMut};
 use rulinalg::utils;
@@ -131,7 +131,7 @@ impl NetLayer for Linear {
     }
 }
 
-impl<T: ActivationFunc + Debug> NetLayer for T {
+impl<T: ActivationFunc> NetLayer for T {
     /// Applies the activation function to each element of the input
     fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> LearningResult<Matrix<f64>> {
         Ok(input.clone().apply(&T::func))

From c236101414a27570730bdfbdd26f43da988ea251 Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Fri, 6 Jan 2017 12:58:22 -0800
Subject: [PATCH 25/28] back_* functions now take the layer's output as an
 input

---
 src/learning/nnet/mod.rs         |  5 +++--
 src/learning/nnet/net_layer.rs   | 31 +++++++++++++++++--------------
 src/learning/toolkit/activ_fn.rs | 20 ++++++++++++++++++++
 3 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index 22311a8d..430316dc 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -372,12 +372,13 @@ impl<T: Criterion> BaseNeuralNet<T> {
         // at this point index == weights.len()
         for (i, layer) in self.layers.iter().enumerate().rev() {
             let activation = if i == 0 {inputs} else {&activations[i-1]};
+            let result = &activations[i];
             index -= layer.num_params();
 
             let grad_params = &mut gradients[index..index+layer.num_params()];
-            grad_params.copy_from_slice(layer.back_params(&out_grad, activation, params[i]).data());
+            grad_params.copy_from_slice(layer.back_params(&out_grad, activation, result, params[i]).data());
             
-            out_grad = layer.back_input(&out_grad, activation, params[i]);
+            out_grad = layer.back_input(&out_grad, activation, result, params[i]);
         }
 
         let mut cost = self.criterion.cost(output, targets);
diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
index 2ecfb276..1278b099 100644
--- a/src/learning/nnet/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -1,7 +1,6 @@
 //! Neural Network Layers
 
-use linalg::{Matrix, MatrixSlice, BaseMatrix, BaseMatrixMut};
-use rulinalg::utils;
+use linalg::{Matrix, MatrixSlice, BaseMatrix};
 
 use learning::LearningResult;
 use learning::error::{Error, ErrorKind};
@@ -19,10 +18,10 @@ pub trait NetLayer : Debug {
     fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> LearningResult<Matrix<f64>>;
 
     /// The gradient of the output of this layer with respect to its input
-    fn back_input(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
+    fn back_input(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, output: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
     
     /// The gradient of the output of this layer with respect to its parameters
-    fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
+    fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, output: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
 
     /// The default value of the parameters of this layer before training
     fn default_params(&self) -> Vec<f64>;
@@ -94,7 +93,7 @@ impl NetLayer for Linear {
         }
     }
     
-    fn back_input(&self, out_grad: &Matrix<f64>, _: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
+    fn back_input(&self, out_grad: &Matrix<f64>, _: &Matrix<f64>, _: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
         debug_assert_eq!(out_grad.cols(), params.cols());
         let gradient = out_grad * &params.transpose();
         if self.has_bias {
@@ -106,7 +105,7 @@ impl NetLayer for Linear {
         }
     }
     
-    fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+    fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
         debug_assert_eq!(input.rows(), out_grad.rows());
         if self.has_bias {
             &Matrix::ones(input.rows(), 1).hcat(input).transpose() * out_grad
@@ -134,18 +133,22 @@ impl NetLayer for Linear {
 impl<T: ActivationFunc> NetLayer for T {
     /// Applies the activation function to each element of the input
     fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> LearningResult<Matrix<f64>> {
-        Ok(input.clone().apply(&T::func))
+        let mut output = Vec::with_capacity(input.rows()*input.cols());
+        for val in input.data() {
+            output.push(T::func(*val));
+        }
+        Ok(Matrix::new(input.rows(), input.cols(), output))
     }
 
-    fn back_input(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
-        let mut in_grad = input.clone();
-        utils::in_place_vec_bin_op(in_grad.mut_data(), out_grad.data(), |x, &y| {
-            *x = T::func_grad(*x) * y
-        });
-        in_grad
+    fn back_input(&self, out_grad: &Matrix<f64>, _: &Matrix<f64>, output: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+        let mut in_grad = Vec::with_capacity(output.rows()*output.cols());
+        for (y, g) in output.data().iter().zip(out_grad.data()) {
+            in_grad.push(T::func_grad_from_output(*y) * g);
+        }
+        Matrix::new(output.rows(), output.cols(), in_grad)
     }
     
-    fn back_params(&self, _: &Matrix<f64>, _: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+    fn back_params(&self, _: &Matrix<f64>, _: &Matrix<f64>, _: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
         Matrix::new(0, 0, Vec::new())
     }
 
diff --git a/src/learning/toolkit/activ_fn.rs b/src/learning/toolkit/activ_fn.rs
index e8e76fc1..8529898e 100644
--- a/src/learning/toolkit/activ_fn.rs
+++ b/src/learning/toolkit/activ_fn.rs
@@ -18,6 +18,10 @@ pub trait ActivationFunc: Clone + Debug {
     /// The gradient of the activation function.
     fn func_grad(x: f64) -> f64;
 
+    /// The gradient of the activation function calculated using the output of the function.
+    /// Calculates f'(x) given f(x) as an input
+    fn func_grad_from_output(y: f64) -> f64;
+
     /// The inverse of the activation function.
     fn func_inv(x: f64) -> f64;
 }
@@ -41,6 +45,10 @@ impl ActivationFunc for Sigmoid {
         Self::func(x) * (1f64 - Self::func(x))
     }
 
+    fn func_grad_from_output(y: f64) -> f64 {
+        y * (1f64 - y)
+    }
+
     fn func_inv(x: f64) -> f64 {
         (x / (1f64 - x)).ln()
     }
@@ -59,6 +67,10 @@ impl ActivationFunc for Linear {
         1f64
     }
 
+    fn func_grad_from_output(_: f64) -> f64 {
+        1f64
+    }
+
     fn func_inv(x: f64) -> f64 {
         x
     }
@@ -77,6 +89,10 @@ impl ActivationFunc for Exp {
         Self::func(x)
     }
 
+    fn func_grad_from_output(y: f64) -> f64 {
+        y
+    }
+
     fn func_inv(x: f64) -> f64 {
         x.ln()
     }
@@ -96,6 +112,10 @@ impl ActivationFunc for Tanh {
         1.0 - y*y
     }
 
+    fn func_grad_from_output(y: f64) -> f64 {
+        1.0 - y*y
+    }
+
     fn func_inv(x: f64) -> f64 {
         0.5*((1.0+x)/(1.0-x)).ln()
     }

From 775f7eb817a3f6c21bfed41ce48e9a86f92eda0b Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Sat, 21 Jan 2017 18:39:42 -0800
Subject: [PATCH 26/28] Fixed typo

---
 src/learning/nnet/net_layer.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
index 1278b099..c0e2c3ce 100644
--- a/src/learning/nnet/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -62,7 +62,7 @@ impl Linear {
         }
     }
 
-    /// Construct a Linear layer with a bias term
+    /// Construct a Linear layer without a bias term
     pub fn without_bias(input_size: usize, output_size: usize) -> Linear {
         Linear {
             input_size: input_size, 

From 95a74c5e10b960317fe09a63bd53f5fdaab3efb4 Mon Sep 17 00:00:00 2001
From: James Lucas <LucasJ94@hotmail.co.uk>
Date: Tue, 21 Feb 2017 10:14:53 -0500
Subject: [PATCH 27/28] Minor cleanup and removing a matrix clone

---
 src/learning/nnet/mod.rs       |  8 +++-----
 src/learning/nnet/net_layer.rs | 35 ++++++++++++++++++++++++++++------
 2 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index 430316dc..80856a2e 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -37,10 +37,10 @@
 //!
 //! The neural networks are specified via a criterion - similar to
 //! [Torch](https://github.com/torch/nn/blob/master/doc/criterion.md).
-//! The criterions combine an activation function and a cost function.
+//! The criterions specify a cost function and any regularization.
 //!
 //! You can define your own criterion by implementing the `Criterion`
-//! trait with a concrete `ActivationFunc` and `CostFunc`.
+//! trait with a concrete `CostFunc`.
 
 
 pub mod net_layer;
@@ -62,7 +62,7 @@ use self::net_layer::NetLayer;
 
 /// Neural Network Model
 ///
-/// The Neural Network struct specifies a Criterion and
+/// The Neural Network struct specifies a `Criterion` and
 /// a gradient descent algorithm.
 #[derive(Debug)]
 pub struct NeuralNet<T, A>
@@ -204,8 +204,6 @@ impl<T, A> NeuralNet<T, A>
     /// use rusty_machine::learning::toolkit::activ_fn::Sigmoid;
     /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
     ///
-    /// use std::clone::Clone;
-    ///
     /// // Create a new neural net 
     /// let mut net = NeuralNet::new(BCECriterion::default(), StochasticGD::default());
     ///
diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
index 1278b099..a9f734d8 100644
--- a/src/learning/nnet/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -40,8 +40,8 @@ pub trait NetLayer : Debug {
 ///
 /// Represents a fully connected layer with optional bias term
 ///
-/// The parameters are a matrix of weights of size I x O
-/// where O is the dimensionality of the output and I the dimensionality of the input
+/// The parameters are a matrix of weights of size I x N
+/// where N is the dimensionality of the output and I the dimensionality of the input
 #[derive(Debug, Clone, Copy)]
 pub struct Linear { 
     /// The number of dimensions of the input
@@ -62,7 +62,7 @@ impl Linear {
         }
     }
 
-    /// Construct a Linear layer with a bias term
+    /// Construct a Linear layer without a bias term
     pub fn without_bias(input_size: usize, output_size: usize) -> Linear {
         Linear {
             input_size: input_size, 
@@ -72,6 +72,31 @@ impl Linear {
     }
 }
 
+fn remove_first_col(mat: Matrix<f64>) -> Matrix<f64>
+{
+    let rows = mat.rows();
+    let cols = mat.cols();
+    let mut data = mat.into_vec();
+
+    let len = data.len();
+    let mut del = 0;
+    {
+        let v = &mut *data;
+
+        for i in 0..len {
+            if i % cols == 0 {
+                del += 1;
+            } else if del > 0 {
+                v.swap(i - del, i);
+            }
+        }
+    }
+    if del > 0 {
+        data.truncate(len - del);
+    }
+    Matrix::new(rows, cols - 1, data)
+}
+
 impl NetLayer for Linear {
     /// Computes a matrix product
     ///
@@ -97,9 +122,7 @@ impl NetLayer for Linear {
         debug_assert_eq!(out_grad.cols(), params.cols());
         let gradient = out_grad * &params.transpose();
         if self.has_bias {
-            let rows = gradient.rows();
-            let cols = gradient.cols() - 1;
-            gradient.sub_slice([0, 1], rows, cols).into() 
+            remove_first_col(gradient)
         } else {
             gradient
         }

From 34ea5d3e6a3bbda0679ff8e5d94891cca6f0a6ca Mon Sep 17 00:00:00 2001
From: NivenT <nachenjang@gmail.com>
Date: Fri, 24 Feb 2017 22:23:02 -0800
Subject: [PATCH 28/28] small changes

---
 src/learning/nnet/mod.rs       | 33 ++++++++++++++++++---------------
 src/learning/nnet/net_layer.rs |  2 +-
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/src/learning/nnet/mod.rs b/src/learning/nnet/mod.rs
index 80856a2e..5d0356ff 100644
--- a/src/learning/nnet/mod.rs
+++ b/src/learning/nnet/mod.rs
@@ -398,30 +398,33 @@ impl<T: Criterion> BaseNeuralNet<T> {
             return Ok(inputs.clone());
         }
 
-        let mut ptr = self.weights.as_ptr();
         let mut outputs = unsafe {
             let shape = self.layers[0].param_shape();
-            let slice = MatrixSlice::from_raw_parts(ptr,
+            let slice = MatrixSlice::from_raw_parts(self.weights.as_ptr(),
                                                     shape.0,
                                                     shape.1,
                                                     shape.1);
-            ptr = ptr.offset(self.layers[0].num_params() as isize);
             try!(self.layers[0].forward(inputs, slice))
         };
+
+        let mut index = self.layers[0].num_params();
         for layer in self.layers.iter().skip(1) {
             let shape = layer.param_shape();
-            unsafe {
-                let slice = MatrixSlice::from_raw_parts(ptr,
-                                                        shape.0,
-                                                        shape.1,
-                                                        shape.1);
-                outputs = match layer.forward(&outputs, slice) {
-                    Ok(act) => act,
-                    Err(_) => {return Err(Error::new(ErrorKind::InvalidParameters,
-                        "The network's layers do not line up correctly."))}
-                };
-                ptr = ptr.offset(layer.num_params() as isize);
-            }
+
+            let slice = unsafe {
+                MatrixSlice::from_raw_parts(self.weights.as_ptr().offset(index as isize),
+                                            shape.0,
+                                            shape.1,
+                                            shape.1)
+            };
+            
+            outputs = match layer.forward(&outputs, slice) {
+                Ok(act) => act,
+                Err(_) => {return Err(Error::new(ErrorKind::InvalidParameters,
+                    "The network's layers do not line up correctly."))}
+            };
+
+            index += layer.num_params();
         }
         Ok(outputs)
     }
diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
index a9f734d8..8239d7ab 100644
--- a/src/learning/nnet/net_layer.rs
+++ b/src/learning/nnet/net_layer.rs
@@ -87,7 +87,7 @@ fn remove_first_col(mat: Matrix<f64>) -> Matrix<f64>
             if i % cols == 0 {
                 del += 1;
             } else if del > 0 {
-                v.swap(i - del, i);
+                v[i - del] = v[i];
             }
         }
     }