diff --git a/benches/examples/nnet.rs b/benches/examples/nnet.rs
index 0c1848da..d67e434d 100644
--- a/benches/examples/nnet.rs
+++ b/benches/examples/nnet.rs
@@ -5,6 +5,7 @@ use std::vec::Vec;
 
 use rusty_machine::learning::nnet::{NeuralNet, BCECriterion};
 use rusty_machine::learning::toolkit::regularization::Regularization;
+use rusty_machine::learning::toolkit::activ_fn::Sigmoid;
 use rusty_machine::learning::optim::grad_desc::StochasticGD;
 
 use rusty_machine::linalg::Matrix;
@@ -51,7 +52,7 @@ fn nnet_and_gate_train(b: &mut Bencher) {
     let criterion = BCECriterion::new(Regularization::L2(0.));
 
     b.iter(|| {
-        let mut model = black_box(NeuralNet::new(layers, criterion, StochasticGD::default()));
+        let mut model = black_box(NeuralNet::mlp(layers, criterion, StochasticGD::default(), Sigmoid));
         let _ = black_box(model.train(&inputs, &targets).unwrap());
     })
 }
@@ -62,7 +63,7 @@ fn nnet_and_gate_predict(b: &mut Bencher) {
     let layers = &[2, 1];
     let criterion = BCECriterion::new(Regularization::L2(0.));
 
-    let mut model = NeuralNet::new(layers, criterion, StochasticGD::default());
+    let mut model = NeuralNet::mlp(layers, criterion, StochasticGD::default(), Sigmoid);
     let _ = model.train(&inputs, &targets);
 
     b.iter(|| {
diff --git a/examples/nnet-and_gate.rs b/examples/nnet-and_gate.rs
index b4c6b156..9fd7ac5e 100644
--- a/examples/nnet-and_gate.rs
+++ b/examples/nnet-and_gate.rs
@@ -6,6 +6,7 @@ use std::vec::Vec;
 
 use rusty_machine::learning::nnet::{NeuralNet, BCECriterion};
 use rusty_machine::learning::toolkit::regularization::Regularization;
+use rusty_machine::learning::toolkit::activ_fn::Sigmoid;
 use rusty_machine::learning::optim::grad_desc::StochasticGD;
 
 use rusty_machine::linalg::Matrix;
@@ -41,7 +42,9 @@ fn main() {
 
     let layers = &[2, 1];
     let criterion = BCECriterion::new(Regularization::L2(0.));
-    let mut model = NeuralNet::new(layers, criterion, StochasticGD::default());
+    // Create a multilayer perceptron with an input layer of size 2 and output layer of size 1
+    // Uses a Sigmoid activation function and uses Stochastic gradient descent for training
+    let mut model = NeuralNet::mlp(layers, criterion, StochasticGD::default(), Sigmoid);
 
     println!("Training...");
     // Our train function returns a Result<(), E>
diff --git a/src/learning/nnet.rs b/src/learning/nnet/mod.rs
similarity index 51%
rename from src/learning/nnet.rs
rename to src/learning/nnet/mod.rs
index 3d926465..5d0356ff 100644
--- a/src/learning/nnet.rs
+++ b/src/learning/nnet/mod.rs
@@ -7,6 +7,7 @@
 //! ```
 //! use rusty_machine::learning::nnet::{NeuralNet, BCECriterion};
 //! use rusty_machine::learning::toolkit::regularization::Regularization;
+//! use rusty_machine::learning::toolkit::activ_fn::Sigmoid;
 //! use rusty_machine::learning::optim::grad_desc::StochasticGD;
 //! use rusty_machine::linalg::Matrix;
 //! use rusty_machine::learning::SupModel;
@@ -22,8 +23,8 @@
 //! // Choose the BCE criterion with L2 regularization (`lambda=0.1`).
 //! let criterion = BCECriterion::new(Regularization::L2(0.1));
 //!
-//! // We will just use the default stochastic gradient descent.
-//! let mut model = NeuralNet::new(layers, criterion, StochasticGD::default());
+//! // We will create a multilayer perceptron and just use the default stochastic gradient descent.
+//! let mut model = NeuralNet::mlp(layers, criterion, StochasticGD::default(), Sigmoid);
 //!
 //! // Train the model!
 //! model.train(&inputs, &targets).unwrap();
@@ -36,12 +37,16 @@
 //!
 //! The neural networks are specified via a criterion - similar to
 //! [Torch](https://github.com/torch/nn/blob/master/doc/criterion.md).
-//! The criterions combine an activation function and a cost function.
+//! The criterions specify a cost function and any regularization.
 //!
 //! You can define your own criterion by implementing the `Criterion`
-//! trait with a concrete `ActivationFunc` and `CostFunc`.
+//! trait with a concrete `CostFunc`.
 
-use linalg::{Matrix, MatrixSlice, BaseMatrix, BaseMatrixMut};
+
+pub mod net_layer;
+
+use linalg::{Matrix, MatrixSlice};
+use rulinalg::utils;
 
 use learning::{LearningResult, SupModel};
 use learning::error::{Error, ErrorKind};
@@ -53,28 +58,27 @@ use learning::toolkit::regularization::Regularization;
 use learning::optim::{Optimizable, OptimAlgorithm};
 use learning::optim::grad_desc::StochasticGD;
 
-use rand::thread_rng;
-use rand::distributions::{Sample, range};
+use self::net_layer::NetLayer;
 
 /// Neural Network Model
 ///
-/// The Neural Network struct specifies a Criterion and
+/// The Neural Network struct specifies a `Criterion` and
 /// a gradient descent algorithm.
 #[derive(Debug)]
-pub struct NeuralNet<'a, T, A>
+pub struct NeuralNet<T, A>
     where T: Criterion,
-          A: OptimAlgorithm<BaseNeuralNet<'a, T>>
+          A: OptimAlgorithm<BaseNeuralNet<T>>
 {
-    base: BaseNeuralNet<'a, T>,
+    base: BaseNeuralNet<T>,
     alg: A,
 }
 
 /// Supervised learning for the Neural Network.
 ///
 /// The model is trained using back propagation.
-impl<'a, T, A> SupModel<Matrix<f64>, Matrix<f64>> for NeuralNet<'a, T, A>
+impl<T, A> SupModel<Matrix<f64>, Matrix<f64>> for NeuralNet<T, A>
     where T: Criterion,
-          A: OptimAlgorithm<BaseNeuralNet<'a, T>>
+          A: OptimAlgorithm<BaseNeuralNet<T>>
 {
     /// Predict neural network output using forward propagation.
     fn predict(&self, inputs: &Matrix<f64>) -> LearningResult<Matrix<f64>> {
@@ -89,7 +93,7 @@ impl<'a, T, A> SupModel<Matrix<f64>, Matrix<f64>> for NeuralNet<'a, T, A>
     }
 }
 
-impl<'a> NeuralNet<'a, BCECriterion, StochasticGD> {
+impl NeuralNet<BCECriterion, StochasticGD> {
     /// Creates a neural network with the specified layer sizes.
     ///
     /// The layer sizes slice should include the input, hidden layers, and output layer sizes.
@@ -108,17 +112,36 @@ impl<'a> NeuralNet<'a, BCECriterion, StochasticGD> {
     /// ```
     pub fn default(layer_sizes: &[usize]) -> NeuralNet<BCECriterion, StochasticGD> {
         NeuralNet {
-            base: BaseNeuralNet::default(layer_sizes),
+            base: BaseNeuralNet::default(layer_sizes, activ_fn::Sigmoid),
             alg: StochasticGD::default(),
         }
     }
 }
 
-impl<'a, T, A> NeuralNet<'a, T, A>
+impl<T, A> NeuralNet<T, A>
     where T: Criterion,
-          A: OptimAlgorithm<BaseNeuralNet<'a, T>>
+          A: OptimAlgorithm<BaseNeuralNet<T>>
 {
-    /// Create a new neural network with the specified layer sizes.
+    /// Create a new neural network with no layers
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rusty_machine::learning::nnet::BCECriterion;
+    /// use rusty_machine::learning::nnet::NeuralNet;
+    /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
+    ///
+    /// // Create an empty neural net
+    /// let mut net = NeuralNet::new(BCECriterion::default(), StochasticGD::default());
+    /// ```
+    pub fn new(criterion: T, alg: A) -> NeuralNet<T, A> {
+        NeuralNet {
+            base: BaseNeuralNet::new(criterion),
+            alg: alg,
+        }
+    }
+
+    /// Create a multilayer perceptron with the specified layer sizes.
     ///
     /// The layer sizes slice should include the input, hidden layers, and output layer sizes.
     /// The type of activation function must be specified.
@@ -130,19 +153,71 @@ impl<'a, T, A> NeuralNet<'a, T, A>
     /// ```
     /// use rusty_machine::learning::nnet::BCECriterion;
     /// use rusty_machine::learning::nnet::NeuralNet;
+    /// use rusty_machine::learning::toolkit::activ_fn::Sigmoid;
     /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
     ///
     /// // Create a neural net with 4 layers, 3 neurons in each.
     /// let layers = &[3; 4];
-    /// let mut net = NeuralNet::new(layers, BCECriterion::default(), StochasticGD::default());
+    /// let mut net = NeuralNet::mlp(layers, BCECriterion::default(), StochasticGD::default(), Sigmoid);
     /// ```
-    pub fn new(layer_sizes: &'a [usize], criterion: T, alg: A) -> NeuralNet<'a, T, A> {
+    pub fn mlp<U>(layer_sizes: &[usize], criterion: T, alg: A, activ_fn: U) -> NeuralNet<T, A> 
+        where U: ActivationFunc + 'static {
         NeuralNet {
-            base: BaseNeuralNet::new(layer_sizes, criterion),
+            base: BaseNeuralNet::mlp(layer_sizes, criterion, activ_fn),
             alg: alg,
         }
     }
 
+    /// Adds the specified layer to the end of the network
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rusty_machine::linalg::BaseMatrix;
+    /// use rusty_machine::learning::nnet::BCECriterion;
+    /// use rusty_machine::learning::nnet::NeuralNet;
+    /// use rusty_machine::learning::nnet::net_layer::Linear;
+    /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
+    ///
+    /// // Create a new neural net 
+    /// let mut net = NeuralNet::new(BCECriterion::default(), StochasticGD::default());
+    ///
+    /// // Give net an input layer of size 3, hidden layer of size 4, and output layer of size 5
+    /// // This net will not apply any activation function to the Linear layer outputs
+    /// net.add(Box::new(Linear::new(3, 4)))
+    ///    .add(Box::new(Linear::new(4, 5)));
+    /// ```
+    pub fn add<'a>(&'a mut self, layer: Box<NetLayer>) -> &'a mut NeuralNet<T, A> {
+        self.base.add(layer);
+        self
+    }
+
+    /// Adds multiple layers to the end of the network
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rusty_machine::linalg::BaseMatrix;
+    /// use rusty_machine::learning::nnet::BCECriterion;
+    /// use rusty_machine::learning::nnet::NeuralNet;
+    /// use rusty_machine::learning::nnet::net_layer::{NetLayer, Linear};
+    /// use rusty_machine::learning::toolkit::activ_fn::Sigmoid;
+    /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
+    ///
+    /// // Create a new neural net 
+    /// let mut net = NeuralNet::new(BCECriterion::default(), StochasticGD::default());
+    ///
+    /// let linear_sig: Vec<Box<NetLayer>> = vec![Box::new(Linear::new(5, 5)), Box::new(Sigmoid)];
+    ///
+    /// // Give net a layer of size 5, followed by a Sigmoid activation function
+    /// net.add_layers(linear_sig);
+    /// ```
+    pub fn add_layers<'a, U>(&'a mut self, layers: U) -> &'a mut NeuralNet<T, A>
+        where U: IntoIterator<Item = Box<NetLayer>> {
+            self.base.add_layers(layers);
+            self
+    }
+
     /// Gets matrix of weights between specified layer and forward layer.
     ///
     /// # Examples
@@ -168,86 +243,84 @@ impl<'a, T, A> NeuralNet<'a, T, A>
 
 /// Base Neural Network struct
 ///
-/// This struct cannot be instantianated and is used internally only.
+/// This struct cannot be instantiated and is used internally only.
 #[derive(Debug)]
-pub struct BaseNeuralNet<'a, T: Criterion> {
-    layer_sizes: &'a [usize],
+pub struct BaseNeuralNet<T: Criterion> {
+    layers: Vec<Box<NetLayer>>,
     weights: Vec<f64>,
     criterion: T,
 }
 
 
-impl<'a> BaseNeuralNet<'a, BCECriterion> {
+impl BaseNeuralNet<BCECriterion> {
     /// Creates a base neural network with the specified layer sizes.
-    fn default(layer_sizes: &[usize]) -> BaseNeuralNet<BCECriterion> {
-        BaseNeuralNet::new(layer_sizes, BCECriterion::default())
+    fn default<U>(layer_sizes: &[usize], activ_fn: U) -> BaseNeuralNet<BCECriterion>
+        where U: ActivationFunc + 'static {
+        BaseNeuralNet::mlp(layer_sizes, BCECriterion::default(), activ_fn)
     }
 }
 
 
-impl<'a, T: Criterion> BaseNeuralNet<'a, T> {
-    /// Create a new base neural network with the specified layer sizes.
-    fn new(layer_sizes: &[usize], criterion: T) -> BaseNeuralNet<T> {
+impl<T: Criterion> BaseNeuralNet<T> {
+    /// Create a base neural network with no layers
+    fn new(criterion: T) -> BaseNeuralNet<T> {
         BaseNeuralNet {
-            layer_sizes: layer_sizes,
-            weights: BaseNeuralNet::<T>::create_weights(layer_sizes),
-            criterion: criterion,
+            layers: Vec::new(),
+            weights: Vec::new(),
+            criterion: criterion
         }
+    } 
+
+    /// Create a multilayer perceptron: a `Linear` layer followed by `activ_fn` for each adjacent pair of sizes.
+    fn mlp<U: ActivationFunc + 'static>(layer_sizes: &[usize], criterion: T, activ_fn: U) -> BaseNeuralNet<T> {
+        let mut mlp = BaseNeuralNet {
+            // `saturating_sub` avoids the usize underflow of `len() - 1` when `layer_sizes` is empty.
+            layers: Vec::with_capacity(2 * layer_sizes.len().saturating_sub(1)),
+            weights: Vec::new(),
+            criterion: criterion
+        };
+        for shape in layer_sizes.windows(2) {
+            mlp.add(Box::new(net_layer::Linear::new(shape[0], shape[1])));
+            mlp.add(Box::new(activ_fn.clone()));
+        }
+        mlp
+    }
+
+    /// Adds the specified layer to the end of the network
+    fn add<'a>(&'a mut self, layer: Box<NetLayer>) -> &'a mut BaseNeuralNet<T> {
+        self.weights.extend_from_slice(&layer.default_params());
+        self.layers.push(layer);
+        self
     }
 
-    /// Creates initial weights for all neurons in the network.
-    fn create_weights(layer_sizes: &[usize]) -> Vec<f64> {
-        let mut between = range::Range::new(0f64, 1f64);
-        let mut rng = thread_rng();
-        layer_sizes.windows(2)
-            .flat_map(|w| {
-                let l_in = w[0] + 1;
-                let l_out = w[1];
-                let eps_init = (6f64 / (l_in + l_out) as f64).sqrt();
-                (0..l_in * l_out)
-                    .map(|_i| (between.sample(&mut rng) * 2f64 * eps_init) - eps_init)
-                    .collect::<Vec<_>>()
-            })
-            .collect()
+    /// Adds multiple layers to the end of the network
+    fn add_layers<'a, U>(&'a mut self, layers: U) -> &'a mut BaseNeuralNet<T>
+        where U: IntoIterator<Item = Box<NetLayer>> 
+    {
+        for layer in layers {
+            self.add(layer);
+        }
+        self
     }
 
-    /// Gets matrix of weights between specified layer and forward layer for the weights.
+    /// Gets the weight matrix of the specified layer as a view into the given flat `weights` slice.
     fn get_layer_weights(&self, weights: &[f64], idx: usize) -> MatrixSlice<f64> {
-        debug_assert!(idx < self.layer_sizes.len() - 1);
+        debug_assert!(idx < self.layers.len());
 
         // Check that the weights are the right size.
-        let mut full_size = 0usize;
-        for l in 0..self.layer_sizes.len() - 1 {
-            full_size += (self.layer_sizes[l] + 1) * self.layer_sizes[l + 1];
-        }
+        let full_size: usize = self.layers.iter().map(|l| l.num_params()).sum();
 
         debug_assert_eq!(full_size, weights.len());
 
-        let mut start = 0usize;
-
-        for l in 0..idx {
-            start += (self.layer_sizes[l] + 1) * self.layer_sizes[l + 1]
-        }
+        let start: usize = self.layers.iter().take(idx).map(|l| l.num_params()).sum();
 
+        let shape = self.layers[idx].param_shape();
         unsafe {
             MatrixSlice::from_raw_parts(weights.as_ptr().offset(start as isize),
-                                        self.layer_sizes[idx] + 1,
-                                        self.layer_sizes[idx + 1],
-                                        self.layer_sizes[idx + 1])
+                                        shape.0,
+                                        shape.1,
+                                        shape.1)
         }
-
-    }
-
-    /// Gets matrix of weights between specified layer and forward layer
-    /// for the base model.
-    fn get_net_weights(&self, idx: usize) -> MatrixSlice<f64> {
-        self.get_layer_weights(&self.weights[..], idx)
-    }
-
-    /// Gets the weights for a layer excluding the bias weights.
-    fn get_non_bias_weights(&self, weights: &[f64], idx: usize) -> MatrixSlice<f64> {
-        let layer_weights = self.get_layer_weights(weights, idx);
-        layer_weights.sub_slice([1, 0], layer_weights.rows() - 1, layer_weights.cols())
     }
 
     /// Compute the gradient using the back propagation algorithm.
@@ -256,118 +329,110 @@ impl<'a, T: Criterion> BaseNeuralNet<'a, T> {
                     inputs: &Matrix<f64>,
                     targets: &Matrix<f64>)
                     -> (f64, Vec<f64>) {
-        assert_eq!(inputs.cols(), self.layer_sizes[0]);
-
-        let mut forward_weights = Vec::with_capacity(self.layer_sizes.len() - 1);
-        let mut activations = Vec::with_capacity(self.layer_sizes.len());
-
-        let net_data = Matrix::ones(inputs.rows(), 1).hcat(inputs);
-
-        activations.push(net_data.clone());
+        let mut gradients = Vec::with_capacity(weights.len());
+        unsafe {
+            gradients.set_len(weights.len());
+        }
+        // activations[i] is the output of layer[i]
+        let mut activations = Vec::with_capacity(self.layers.len());
+        // params[i] is the weights for layer[i]
+        let mut params = Vec::with_capacity(self.layers.len());
 
         // Forward propagation
-        {
-            let mut z = net_data * self.get_layer_weights(weights, 0);
-            forward_weights.push(z.clone());
-
-            for l in 1..self.layer_sizes.len() - 1 {
-                let mut a = self.criterion.activate(z.clone());
-                let ones = Matrix::ones(a.rows(), 1);
-
-                a = ones.hcat(&a);
-
-                z = &a * self.get_layer_weights(weights, l);
-                activations.push(a);
-                forward_weights.push(z.clone());
-            }
-
-            activations.push(self.criterion.activate(z));
+        
+        let mut index = 0;
+        for (i, layer) in self.layers.iter().enumerate() {
+            let shape = layer.param_shape();
+
+            let slice = unsafe {
+                MatrixSlice::from_raw_parts(weights.as_ptr().offset(index as isize),
+                                            shape.0,
+                                            shape.1,
+                                            shape.1)
+            };
+
+            let output = if i == 0 {
+                layer.forward(inputs, slice).unwrap()
+            } else {
+                layer.forward(activations.last().unwrap(), slice).unwrap()
+            };
+
+            activations.push(output);
+            params.push(slice);
+            index += layer.num_params();
         }
+        let output = activations.last().unwrap();
 
-        let mut deltas = Vec::with_capacity(self.layer_sizes.len() - 1);
         // Backward propagation
-        {
-            let z = forward_weights[self.layer_sizes.len() - 2].clone();
-            let g = self.criterion.grad_activ(z);
-
-            // Take GRAD_cost to compute this delta.
-            let mut delta = self.criterion
-                .cost_grad(&activations[self.layer_sizes.len() - 1], targets)
-                .elemul(&g);
-
-            deltas.push(delta.clone());
-
-            for l in (1..self.layer_sizes.len() - 1).rev() {
-                let mut z = forward_weights[l - 1].clone();
-                let ones = Matrix::ones(z.rows(), 1);
-                z = ones.hcat(&z);
-
-                let g = self.criterion.grad_activ(z);
-                delta = (delta * Matrix::from(self.get_layer_weights(weights, l)).transpose())
-                    .elemul(&g);
-
-                let non_one_rows = &(1..delta.cols()).collect::<Vec<usize>>()[..];
-                delta = delta.select_cols(non_one_rows);
-                deltas.push(delta.clone());
-            }
+        
+        // The gradient with respect to the current layer's output
+        let mut out_grad = self.criterion.cost_grad(output, targets);
+        // at this point index == weights.len()
+        for (i, layer) in self.layers.iter().enumerate().rev() {
+            let activation = if i == 0 {inputs} else {&activations[i-1]};
+            let result = &activations[i];
+            index -= layer.num_params();
+
+            let grad_params = &mut gradients[index..index+layer.num_params()];
+            grad_params.copy_from_slice(layer.back_params(&out_grad, activation, result, params[i]).data());
+            
+            out_grad = layer.back_input(&out_grad, activation, result, params[i]);
         }
 
-        let mut gradients = Vec::with_capacity(weights.len());
-
-        for (l, activ_item) in activations.iter().take(self.layer_sizes.len() - 1).enumerate() {
-            // Compute the gradient
-            let mut g = deltas[self.layer_sizes.len() - 2 - l].transpose() * activ_item;
-
-            // Add the regularized gradient
-            if self.criterion.is_regularized() {
-                let layer = l;
-                let non_bias_weights = self.get_non_bias_weights(weights, layer);
-                let zeros = Matrix::zeros(1, non_bias_weights.cols());
-                g += zeros.vcat(&self.criterion.reg_cost_grad(non_bias_weights));
-            }
-
-            gradients.append(&mut (g / inputs.rows() as f64).into_vec());
-        }
-
-        // Compute the cost
-        let mut cost = self.criterion.cost(&activations[activations.len() - 1], targets);
-
-        // Add the regularized cost
+        let mut cost = self.criterion.cost(output, targets);
         if self.criterion.is_regularized() {
-            for i in 0..self.layer_sizes.len() - 1 {
-                cost += self.criterion.reg_cost(self.get_non_bias_weights(weights, i));
-            }
+            let all_params = unsafe {
+                MatrixSlice::from_raw_parts(weights.as_ptr(), weights.len(), 1, 1)
+            };
+            utils::in_place_vec_bin_op(&mut gradients,
+                                       self.criterion.reg_cost_grad(all_params).data(),
+                                       |x, &y| *x = *x + y);
+            cost += self.criterion.reg_cost(all_params);
         }
-
         (cost, gradients)
     }
 
     /// Forward propagation of the model weights to get the outputs.
     fn forward_prop(&self, inputs: &Matrix<f64>) -> LearningResult<Matrix<f64>> {
-        if inputs.cols() != self.layer_sizes[0] {
-            Err(Error::new(ErrorKind::InvalidData,
-                           "The input data dimensions must match the first layer."))
-        } else {
-            let net_data = Matrix::ones(inputs.rows(), 1).hcat(inputs);
-
-            let mut z = net_data * self.get_net_weights(0);
-            let mut a = self.criterion.activate(z.clone());
-
-            for l in 1..self.layer_sizes.len() - 1 {
-                let ones = Matrix::ones(a.rows(), 1);
-                a = ones.hcat(&a);
-                z = a * self.get_net_weights(l);
-                a = self.criterion.activate(z.clone());
-            }
-
-            Ok(a)
+        if self.layers.is_empty() {
+            return Ok(inputs.clone());
+        }
+
+        let mut outputs = unsafe {
+            let shape = self.layers[0].param_shape();
+            let slice = MatrixSlice::from_raw_parts(self.weights.as_ptr(),
+                                                    shape.0,
+                                                    shape.1,
+                                                    shape.1);
+            try!(self.layers[0].forward(inputs, slice))
+        };
+
+        let mut index = self.layers[0].num_params();
+        for layer in self.layers.iter().skip(1) {
+            let shape = layer.param_shape();
+
+            let slice = unsafe {
+                MatrixSlice::from_raw_parts(self.weights.as_ptr().offset(index as isize),
+                                            shape.0,
+                                            shape.1,
+                                            shape.1)
+            };
+            
+            outputs = match layer.forward(&outputs, slice) {
+                Ok(act) => act,
+                Err(_) => {return Err(Error::new(ErrorKind::InvalidParameters,
+                    "The network's layers do not line up correctly."))}
+            };
+
+            index += layer.num_params();
         }
+        Ok(outputs)
     }
 }
 
 /// Compute the gradient of the Neural Network using the
 /// back propagation algorithm.
-impl<'a, T: Criterion> Optimizable for BaseNeuralNet<'a, T> {
+impl<T: Criterion> Optimizable for BaseNeuralNet<T> {
     type Inputs = Matrix<f64>;
     type Targets = Matrix<f64>;
 
@@ -385,21 +450,9 @@ impl<'a, T: Criterion> Optimizable for BaseNeuralNet<'a, T> {
 ///
 /// Specifies an activation function and a cost function.
 pub trait Criterion {
-    /// The activation function for the criterion.
-    type ActFunc: ActivationFunc;
     /// The cost function for the criterion.
     type Cost: CostFunc<Matrix<f64>>;
 
-    /// The activation function applied to a matrix.
-    fn activate(&self, mat: Matrix<f64>) -> Matrix<f64> {
-        mat.apply(&Self::ActFunc::func)
-    }
-
-    /// The gradient of the activation function applied to a matrix.
-    fn grad_activ(&self, mat: Matrix<f64>) -> Matrix<f64> {
-        mat.apply(&Self::ActFunc::func_grad)
-    }
-
     /// The cost function.
     ///
     /// Returns a scalar cost.
@@ -462,7 +515,6 @@ pub struct BCECriterion {
 }
 
 impl Criterion for BCECriterion {
-    type ActFunc = activ_fn::Sigmoid;
     type Cost = cost_fn::CrossEntropyError;
 
     fn regularization(&self) -> Regularization<f64> {
@@ -504,7 +556,6 @@ pub struct MSECriterion {
 }
 
 impl Criterion for MSECriterion {
-    type ActFunc = activ_fn::Linear;
     type Cost = cost_fn::MeanSqError;
 
     fn regularization(&self) -> Regularization<f64> {
diff --git a/src/learning/nnet/net_layer.rs b/src/learning/nnet/net_layer.rs
new file mode 100644
index 00000000..8239d7ab
--- /dev/null
+++ b/src/learning/nnet/net_layer.rs
@@ -0,0 +1,185 @@
+//! Neural Network Layers
+
+use linalg::{Matrix, MatrixSlice, BaseMatrix};
+
+use learning::LearningResult;
+use learning::error::{Error, ErrorKind};
+use learning::toolkit::activ_fn::ActivationFunc;
+
+use rand::thread_rng;
+use rand::distributions::Sample;
+use rand::distributions::normal::Normal;
+
+use std::fmt::Debug;
+
+/// Trait for neural net layers: a forward pass plus the gradients used by back propagation
+pub trait NetLayer : Debug {
+    /// The result of propagating `input` forward through this layer using the parameters `params`
+    fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> LearningResult<Matrix<f64>>;
+
+    /// The gradient with respect to this layer's input, given `out_grad`, the gradient with respect to its output
+    fn back_input(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, output: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
+    
+    /// The gradient with respect to this layer's parameters, given `out_grad`, the gradient with respect to its output
+    fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, output: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64>;
+
+    /// The initial values of this layer's parameters, used before any training
+    fn default_params(&self) -> Vec<f64>;
+
+    /// The shape `(rows, cols)` of the parameter matrix used by this layer
+    fn param_shape(&self) -> (usize, usize);
+
+    /// The number of parameters used by this layer (the product of the `param_shape` dimensions)
+    fn num_params(&self) -> usize {
+        let shape = self.param_shape();
+        shape.0 * shape.1
+    }
+}
+
+/// Linear network layer
+///
+/// Represents a fully connected layer with optional bias term
+///
+/// The parameters are a matrix of weights of size I x N, or (I + 1) x N when a bias term is used,
+/// where N is the dimensionality of the output and I the dimensionality of the input
+#[derive(Debug, Clone, Copy)]
+pub struct Linear { 
+    /// The number of rows of the weight matrix: the input dimensionality, plus one if there is a bias term
+    input_size: usize,
+    /// The number of dimensions of the output
+    output_size: usize,
+    /// Whether or not to include a bias term
+    has_bias: bool,
+}
+
+impl Linear {
+    /// Construct a new Linear layer
+    pub fn new(input_size: usize, output_size: usize) -> Linear {
+        Linear {
+            input_size: input_size + 1, 
+            output_size: output_size,
+            has_bias: true
+        }
+    }
+
+    /// Construct a Linear layer without a bias term
+    pub fn without_bias(input_size: usize, output_size: usize) -> Linear {
+        Linear {
+            input_size: input_size, 
+            output_size: output_size,
+            has_bias: false
+        }
+    }
+}
+
+/// Removes the first column of `mat`.
+///
+/// The backing vector is walked by flat index; an element whose index is a
+/// multiple of the column count is dropped, which removes the leading entry
+/// of every row provided the data is stored row by row.
+///
+/// Returns a matrix with the same number of rows and one column fewer.
+fn remove_first_col(mat: Matrix<f64>) -> Matrix<f64>
+{
+    // Read the dimensions before `into_vec` consumes the matrix.
+    let row_count = mat.rows();
+    let col_count = mat.cols();
+
+    // Keep, in their original order, the elements that do not start a row.
+    let kept: Vec<f64> = mat.into_vec()
+        .into_iter()
+        .enumerate()
+        .filter(|&(flat_idx, _)| flat_idx % col_count != 0)
+        .map(|(_, value)| value)
+        .collect();
+
+    // Same row count as the input; the column count shrinks by one.
+    Matrix::new(row_count, col_count - 1, kept)
+}
+
+impl NetLayer for Linear {
+    /// Computes a matrix product
+    ///
+    /// input should have dimensions N x I
+    /// where N is the number of samples and I is the dimensionality of the input
+    fn forward(&self, input: &Matrix<f64>, params: MatrixSlice<f64>) -> LearningResult<Matrix<f64>> {
+        if self.has_bias {
+            if input.cols()+1 != params.rows() {
+                Err(Error::new(ErrorKind::InvalidData, "The input had the wrong number of columns"))
+            } else {
+                Ok(&Matrix::ones(input.rows(), 1).hcat(input) * &params)
+            }
+        } else {
+            if input.cols() != params.rows() {
+                Err(Error::new(ErrorKind::InvalidData, "The input had the wrong number of columns"))
+            } else {
+                Ok(input * &params)
+            }
+        }
+    }
+    
+    fn back_input(&self, out_grad: &Matrix<f64>, _: &Matrix<f64>, _: &Matrix<f64>, params: MatrixSlice<f64>) -> Matrix<f64> {
+        debug_assert_eq!(out_grad.cols(), params.cols());
+        let gradient = out_grad * &params.transpose();
+        if self.has_bias {
+            remove_first_col(gradient)
+        } else {
+            gradient
+        }
+    }
+    
+    fn back_params(&self, out_grad: &Matrix<f64>, input: &Matrix<f64>, _: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+        debug_assert_eq!(input.rows(), out_grad.rows());
+        if self.has_bias {
+            &Matrix::ones(input.rows(), 1).hcat(input).transpose() * out_grad
+        } else {
+            &input.transpose() * out_grad
+        }
+    }
+
+    /// Initializes weights using Xavier initialization
+    ///
+    /// weights drawn from gaussian distribution with 0 mean and variance 2/(input_size+output_size)
+    fn default_params(&self) -> Vec<f64> {
+        let mut distro = Normal::new(0.0, (2.0/(self.input_size+self.output_size) as f64).sqrt());
+        let mut rng = thread_rng();
+
+        (0..self.input_size*self.output_size).map(|_| distro.sample(&mut rng))
+                                             .collect()
+    }
+
+    fn param_shape(&self) -> (usize, usize) {
+        (self.input_size, self.output_size)
+    }
+}
+
+impl<T: ActivationFunc> NetLayer for T {
+    /// Applies the activation function to each element of the input.
+    /// The parameter slice is ignored because this layer has no parameters.
+    fn forward(&self, input: &Matrix<f64>, _: MatrixSlice<f64>) -> LearningResult<Matrix<f64>> {
+        let activated: Vec<f64> = input.data().iter().map(|&x| T::func(x)).collect();
+        Ok(Matrix::new(input.rows(), input.cols(), activated))
+    }
+
+    /// Scales each element of `out_grad` by the activation's derivative, computed from
+    /// the matching element of `output` via `func_grad_from_output`.
+    fn back_input(&self, out_grad: &Matrix<f64>, _: &Matrix<f64>, output: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+        let in_grad: Vec<f64> = output.data()
+            .iter()
+            .zip(out_grad.data().iter())
+            .map(|(&y, &g)| T::func_grad_from_output(y) * g)
+            .collect();
+        Matrix::new(output.rows(), output.cols(), in_grad)
+    }
+
+    /// An activation function has no parameters, so its parameter gradient is empty.
+    fn back_params(&self, _: &Matrix<f64>, _: &Matrix<f64>, _: &Matrix<f64>, _: MatrixSlice<f64>) -> Matrix<f64> {
+        Matrix::new(0, 0, Vec::new())
+    }
+
+    /// No parameters to initialise.
+    fn default_params(&self) -> Vec<f64> { Vec::new() }
+
+    /// The parameter matrix is empty.
+    fn param_shape(&self) -> (usize, usize) { (0, 0) }
+}
diff --git a/src/learning/toolkit/activ_fn.rs b/src/learning/toolkit/activ_fn.rs
index 0ffe4569..8529898e 100644
--- a/src/learning/toolkit/activ_fn.rs
+++ b/src/learning/toolkit/activ_fn.rs
@@ -8,14 +8,20 @@
 //! You can also create your own custom activation Functions for use in your models.
 //! Just create a unit struct implementing the `ActivationFunc` trait.
 
+use std::fmt::Debug;
+
 /// Trait for activation functions in models.
-pub trait ActivationFunc {
+pub trait ActivationFunc: Clone + Debug {
     /// The activation function.
     fn func(x: f64) -> f64;
 
     /// The gradient of the activation function.
     fn func_grad(x: f64) -> f64;
 
+    /// The gradient of the activation function calculated using the output of the function.
+    /// Calculates f'(x) given f(x) as an input
+    fn func_grad_from_output(y: f64) -> f64;
+
     /// The inverse of the activation function.
     fn func_inv(x: f64) -> f64;
 }
@@ -39,6 +45,10 @@ impl ActivationFunc for Sigmoid {
         Self::func(x) * (1f64 - Self::func(x))
     }
 
+    fn func_grad_from_output(y: f64) -> f64 {
+        y * (1f64 - y)
+    }
+
     fn func_inv(x: f64) -> f64 {
         (x / (1f64 - x)).ln()
     }
@@ -57,6 +67,10 @@ impl ActivationFunc for Linear {
         1f64
     }
 
+    fn func_grad_from_output(_: f64) -> f64 {
+        1f64
+    }
+
     fn func_inv(x: f64) -> f64 {
         x
     }
@@ -75,7 +89,34 @@ impl ActivationFunc for Exp {
         Self::func(x)
     }
 
+    fn func_grad_from_output(y: f64) -> f64 {
+        y
+    }
+
     fn func_inv(x: f64) -> f64 {
         x.ln()
     }
 }
+
+/// Hyperbolic tangent activation function.
+#[derive(Clone, Copy, Debug)]
+pub struct Tanh;
+
+impl ActivationFunc for Tanh {
+    /// `tanh(x)`.
+    fn func(x: f64) -> f64 { f64::tanh(x) }
+
+    /// Derivative at `x`, i.e. `1 - tanh(x)^2`.
+    fn func_grad(x: f64) -> f64 {
+        Self::func_grad_from_output(Self::func(x))
+    }
+
+    /// Derivative expressed through the output `y = tanh(x)`: `1 - y^2`.
+    fn func_grad_from_output(y: f64) -> f64 { 1f64 - y * y }
+
+    /// Inverse hyperbolic tangent, `0.5 * ln((1 + x) / (1 - x))`.
+    fn func_inv(x: f64) -> f64 {
+        let ratio = (1f64 + x) / (1f64 - x);
+        0.5 * ratio.ln()
+    }
+}
\ No newline at end of file