diff --git a/Makefile b/Makefile index e42c75ee1e8..ca5fff2c4c7 100644 --- a/Makefile +++ b/Makefile @@ -86,27 +86,37 @@ CUDA_LIB_DIR := $(CUDA_DIR)/lib64 $(CUDA_DIR)/lib MKL_INCLUDE_DIR := $(MKL_DIR)/include MKL_LIB_DIR := $(MKL_DIR)/lib $(MKL_DIR)/lib/intel64 -INCLUDE_DIRS += ./src ./include $(CUDA_INCLUDE_DIR) $(MKL_INCLUDE_DIR) -LIBRARY_DIRS += $(CUDA_LIB_DIR) $(MKL_LIB_DIR) +INCLUDE_DIRS += ./src ./include $(CUDA_INCLUDE_DIR) +LIBRARY_DIRS += $(CUDA_LIB_DIR) LIBRARIES := cudart cublas curand \ - mkl_rt \ pthread \ - glog protobuf leveldb \ - snappy \ + glog protobuf leveldb snappy \ boost_system \ hdf5_hl hdf5 \ opencv_core opencv_highgui opencv_imgproc PYTHON_LIBRARIES := boost_python python2.7 WARNINGS := -Wall -COMMON_FLAGS := -DNDEBUG -O2 $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) +COMMON_FLAGS := -DNDEBUG -O2 + +# MKL switch (default = non-MKL) +USE_MKL ?= 0 +ifeq ($(USE_MKL), 1) + LIBRARIES += mkl_rt + COMMON_FLAGS += -DUSE_MKL + INCLUDE_DIRS += $(MKL_INCLUDE_DIR) + LIBRARY_DIRS += $(MKL_LIB_DIR) +else + LIBRARIES += cblas atlas +endif + +COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS) NVCCFLAGS := -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS) LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) \ $(foreach library,$(LIBRARIES),-l$(library)) PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library)) - ############################## # Define build targets ############################## @@ -210,6 +220,10 @@ $(BUILD_DIR)/src/gtest/%.o: src/gtest/%.cpp $(CXX) $< $(CXXFLAGS) -c -o $@ @echo +$(BUILD_DIR)/src/$(PROJECT)/%.cuo: src/$(PROJECT)/%.cu + $(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ + @echo + $(BUILD_DIR)/src/$(PROJECT)/layers/%.cuo: src/$(PROJECT)/layers/%.cu $(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ @echo diff --git a/Makefile.config.example b/Makefile.config.example index cec85e0a7f7..95656dd0ad1 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -10,6 +10,8 @@ CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \ -gencode arch=compute_30,code=sm_30 \ -gencode arch=compute_35,code=sm_35 +# MKL switch: set to 1 for MKL +USE_MKL := 0 # MKL directory contains include/ and lib/ directions that we need. MKL_DIR := /opt/intel/mkl diff --git a/docs/feature_extraction.md b/docs/feature_extraction.md new file mode 100644 index 00000000000..7671fffa5a8 --- /dev/null +++ b/docs/feature_extraction.md @@ -0,0 +1,67 @@ +--- +layout: default +title: Caffe +--- + +Extracting Features +=================== + +In this tutorial, we will extract features using a pre-trained model. +Follow instructions for [setting up caffe](installation.html) and for [getting](getting_pretrained_models.html) the pre-trained ImageNet model. +If you need detailed information about the tools below, please consult their source code, in which additional documentation is usually provided. + +Select data to run on +--------------------- + +We'll make a temporary folder to store things into. + + mkdir examples/_temp + +Generate a list of the files to process. +We're going to use the images that ship with caffe. 
+ + find `pwd`/examples/images -type f -exec echo {} \; > examples/_temp/file_list.txt + +The `ImagesLayer` we'll use expects labels after each filename, so let's add a 0 to the end of each line. + + sed -i "s/$/ 0/" examples/_temp/file_list.txt + +Define the Feature Extraction Network Architecture +-------------------------------------------------- + +In practice, subtracting the mean image from a dataset significantly improves classification accuracy. +Download the mean image of the ILSVRC dataset. + + data/ilsvrc12/get_ilsvrc_aux.sh + +We will use `data/ilsvrc12/imagenet_mean.binaryproto` in the network definition prototxt. + +Let's copy and modify the network definition. +We'll be using the `ImagesLayer`, which will load and resize images for us. + + cp examples/feature_extraction/imagenet_val.prototxt examples/_temp + +Edit `examples/_temp/imagenet_val.prototxt` to use the correct paths for your setup (replace `$CAFFE_DIR`). + +Extract Features +---------------- + +Now everything necessary is in place. + + build/tools/extract_features.bin models/caffe_reference_imagenet_model examples/_temp/imagenet_val.prototxt fc7 examples/_temp/features 10 + +The name of the feature blob that you extract is `fc7`, which represents the highest-level feature of the reference model. +We can use any other layer as well, such as `conv5` or `pool3`. + +The last parameter above is the number of data mini-batches. + +The features are stored in the LevelDB `examples/_temp/features`, ready for access by other code. + +If you'd like to use the Python wrapper for extracting features, check out the [layer visualization notebook](http://nbviewer.ipython.org/github/BVLC/caffe/blob/master/examples/filter_visualization.ipynb). + +Clean Up +-------- + +Let's remove the temporary directory now. + + rm -r examples/_temp diff --git a/docs/index.md b/docs/index.md index 3db9dbaf20e..98c266c668a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -33,6 +33,7 @@ Even in CPU mode, computing predictions on an image takes only 20 ms when images * [LeNet / MNIST Demo](/mnist.html): end-to-end training and testing of LeNet on MNIST. * [CIFAR-10 Demo](/cifar10.html): training and testing on the CIFAR-10 data. * [Training ImageNet](/imagenet_training.html): end-to-end training of an ImageNet classifier. +* [Feature extraction with C++](/feature_extraction.html): feature extraction using a pre-trained model * [Running Pretrained ImageNet \[notebook\]][pretrained_imagenet]: run classification with the pretrained ImageNet model using the Python interface. * [Running Detection \[notebook\]][imagenet_detection]: run a pretrained model as a detector. * [Visualizing Features and Filters \[notebook\]][visualizing_filters]: trained filters and an example image, viewed layer-by-layer.
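As a quick sanity check on the tutorial above, here is a minimal sketch of the "other code" that might read the extracted features back out of the LevelDB. It assumes `extract_features.bin` serializes one `caffe::Datum` per mini-batch item (with the feature values in `float_data`), and the file name `read_features.cpp` is purely illustrative, not part of this change; link it against leveldb, protobuf, and the generated caffe protos for your setup.

    // read_features.cpp -- illustrative sketch only.
    // Walks the feature LevelDB written by extract_features.bin and prints
    // the dimensionality of each stored feature vector.
    #include <iostream>
    #include <string>
    #include "leveldb/db.h"
    #include "caffe/proto/caffe.pb.h"

    int main() {
      leveldb::DB* db;
      leveldb::Options options;
      options.create_if_missing = false;
      leveldb::Status status =
          leveldb::DB::Open(options, "examples/_temp/features", &db);
      if (!status.ok()) {
        std::cerr << "Failed to open feature DB: " << status.ToString() << std::endl;
        return 1;
      }
      leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
      for (it->SeekToFirst(); it->Valid(); it->Next()) {
        caffe::Datum datum;
        datum.ParseFromString(it->value().ToString());
        // For fc7 each entry should hold a 4096-dimensional feature vector.
        std::cout << it->key().ToString() << ": " << datum.float_data_size()
                  << " values" << std::endl;
      }
      delete it;
      delete db;
      return 0;
    }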
diff --git a/examples/feature_extraction/imagenet_val.prototxt b/examples/feature_extraction/imagenet_val.prototxt new file mode 100644 index 00000000000..c7b26509125 --- /dev/null +++ b/examples/feature_extraction/imagenet_val.prototxt @@ -0,0 +1,247 @@ +name: "CaffeNet" +layers { + layer { + name: "data" + type: "images" + source: "$CAFFE_DIR/examples/_temp/file_list.txt" + meanfile: "$CAFFE_DIR/data/ilsvrc12/imagenet_mean.binaryproto" + batchsize: 50 + new_height: 256 + new_width: 256 + mirror: false + cropsize: 227 + } + top: "data" + top: "label" +} +layers { + layer { + name: "conv1" + type: "conv" + num_output: 96 + kernelsize: 11 + stride: 4 + } + bottom: "data" + top: "conv1" +} +layers { + layer { + name: "relu1" + type: "relu" + } + bottom: "conv1" + top: "conv1" +} +layers { + layer { + name: "pool1" + type: "pool" + pool: MAX + kernelsize: 3 + stride: 2 + } + bottom: "conv1" + top: "pool1" +} +layers { + layer { + name: "norm1" + type: "lrn" + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } + bottom: "pool1" + top: "norm1" +} +layers { + layer { + name: "conv2" + type: "conv" + num_output: 256 + group: 2 + kernelsize: 5 + pad: 2 + } + bottom: "norm1" + top: "conv2" +} +layers { + layer { + name: "relu2" + type: "relu" + } + bottom: "conv2" + top: "conv2" +} +layers { + layer { + name: "pool2" + type: "pool" + pool: MAX + kernelsize: 3 + stride: 2 + } + bottom: "conv2" + top: "pool2" +} +layers { + layer { + name: "norm2" + type: "lrn" + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } + bottom: "pool2" + top: "norm2" +} +layers { + layer { + name: "conv3" + type: "conv" + num_output: 384 + kernelsize: 3 + pad: 1 + } + bottom: "norm2" + top: "conv3" +} +layers { + layer { + name: "relu3" + type: "relu" + } + bottom: "conv3" + top: "conv3" +} +layers { + layer { + name: "conv4" + type: "conv" + num_output: 384 + group: 2 + kernelsize: 3 + pad: 1 + } + bottom: "conv3" + top: "conv4" +} +layers { + layer { + name: "relu4" + type: "relu" + } + bottom: "conv4" + top: "conv4" +} +layers { + layer { + name: "conv5" + type: "conv" + num_output: 256 + group: 2 + kernelsize: 3 + pad: 1 + } + bottom: "conv4" + top: "conv5" +} +layers { + layer { + name: "relu5" + type: "relu" + } + bottom: "conv5" + top: "conv5" +} +layers { + layer { + name: "pool5" + type: "pool" + kernelsize: 3 + pool: MAX + stride: 2 + } + bottom: "conv5" + top: "pool5" +} +layers { + layer { + name: "fc6" + type: "innerproduct" + num_output: 4096 + } + bottom: "pool5" + top: "fc6" +} +layers { + layer { + name: "relu6" + type: "relu" + } + bottom: "fc6" + top: "fc6" +} +layers { + layer { + name: "drop6" + type: "dropout" + dropout_ratio: 0.5 + } + bottom: "fc6" + top: "fc6" +} +layers { + layer { + name: "fc7" + type: "innerproduct" + num_output: 4096 + } + bottom: "fc6" + top: "fc7" +} +layers { + layer { + name: "relu7" + type: "relu" + } + bottom: "fc7" + top: "fc7" +} +layers { + layer { + name: "drop7" + type: "dropout" + dropout_ratio: 0.5 + } + bottom: "fc7" + top: "fc7" +} +layers { + layer { + name: "fc8" + type: "innerproduct" + num_output: 1000 + } + bottom: "fc7" + top: "fc8" +} +layers { + layer { + name: "prob" + type: "softmax" + } + bottom: "fc8" + top: "prob" +} +layers { + layer { + name: "accuracy" + type: "accuracy" + } + bottom: "prob" + bottom: "label" + top: "accuracy" +} diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index f31d3b0f693..75cc3c67288 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -27,6 +27,14 @@ class Blob { inline int count() const {return 
count_; } inline int offset(const int n, const int c = 0, const int h = 0, const int w = 0) const { + CHECK_GE(n, 0); + CHECK_LE(n, num_); + CHECK_GE(channels_, 0); + CHECK_LE(c, channels_); + CHECK_GE(height_, 0); + CHECK_LE(h, height_); + CHECK_GE(width_, 0); + CHECK_LE(w, width_); return ((n * channels_ + c) * height_ + h) * width_ + w; } // Copy from source. If copy_diff is false, we copy the data; if copy_diff diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 96ba58c2716..5344139c551 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -1,4 +1,4 @@ -// Copyright 2013 Yangqing Jia +// Copyright 2014 BVLC and contributors. #ifndef CAFFE_COMMON_HPP_ #define CAFFE_COMMON_HPP_ @@ -7,28 +7,8 @@ #include #include #include -// cuda driver types -#include +#include // cuda driver types #include -#include - -// various checks for different function calls. -#define CUDA_CHECK(condition) CHECK_EQ((condition), cudaSuccess) -#define CUBLAS_CHECK(condition) CHECK_EQ((condition), CUBLAS_STATUS_SUCCESS) -#define CURAND_CHECK(condition) CHECK_EQ((condition), CURAND_STATUS_SUCCESS) -#define VSL_CHECK(condition) CHECK_EQ((condition), VSL_STATUS_OK) - -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) - -// After a kernel is executed, this will check the error and if there is one, -// exit loudly. -#define CUDA_POST_KERNEL_CHECK \ - if (cudaSuccess != cudaPeekAtLastError()) \ - LOG(FATAL) << "Cuda kernel failed. Error: " \ - << cudaGetErrorString(cudaPeekAtLastError()) // Disable the copy and assignment operator for a class. #define DISABLE_COPY_AND_ASSIGN(classname) \ @@ -45,6 +25,23 @@ private:\ // is executed we will see a fatal log. #define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet" +// CUDA: various checks for different function calls. +#define CUDA_CHECK(condition) CHECK_EQ((condition), cudaSuccess) +#define CUBLAS_CHECK(condition) CHECK_EQ((condition), CUBLAS_STATUS_SUCCESS) +#define CURAND_CHECK(condition) CHECK_EQ((condition), CURAND_STATUS_SUCCESS) + +// CUDA: grid stride looping +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +// CUDA: check for error after kernel execution and exit loudly if there is one. +#define CUDA_POST_KERNEL_CHECK \ + if (cudaSuccess != cudaPeekAtLastError()) \ + LOG(FATAL) << "Cuda kernel failed. Error: " \ + << cudaGetErrorString(cudaPeekAtLastError()) + namespace caffe { @@ -53,20 +50,6 @@ namespace caffe { using boost::shared_ptr; -// We will use 1024 threads per block, which requires cuda sm_2x or above. -#if __CUDA_ARCH__ >= 200 - const int CAFFE_CUDA_NUM_THREADS = 1024; -#else - const int CAFFE_CUDA_NUM_THREADS = 512; -#endif - - - -inline int CAFFE_GET_BLOCKS(const int N) { - return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS; -} - - // A singleton class to hold common caffe stuff, such as the handler that // caffe is going to use for cublas, curand, etc. class Caffe { @@ -81,15 +64,32 @@ class Caffe { enum Brew { CPU, GPU }; enum Phase { TRAIN, TEST }; - // The getters for the variables. - // Returns the cublas handle. + + // This random number generator facade hides boost and CUDA rng + // implementation from one another (for cross-platform compatibility). 
+ class RNG { + public: + RNG(); + explicit RNG(unsigned int seed); + ~RNG(); + RNG(const RNG&); + RNG& operator=(const RNG&); + const void* generator() const; + void* generator(); + private: + class Generator; + Generator* generator_; + }; + + // Getters for boost rng, curand, and cublas handles + inline static RNG &rng_stream() { + return Get().random_generator_; + } inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } - // Returns the curand generator. inline static curandGenerator_t curand_generator() { return Get().curand_generator_; } - // Returns the MKL random stream. - inline static VSLStreamStatePtr vsl_stream() { return Get().vsl_stream_; } + // Returns the mode: running on CPU or GPU. inline static Brew mode() { return Get().mode_; } // Returns the phase: TRAIN or TEST. @@ -102,7 +102,7 @@ class Caffe { inline static void set_mode(Brew mode) { Get().mode_ = mode; } // Sets the phase. inline static void set_phase(Phase phase) { Get().phase_ = phase; } - // Sets the random seed of both MKL and curand + // Sets the random seed of both boost and curand static void set_random_seed(const unsigned int seed); // Sets the device. Since we have cublas and curand stuff, set device also // requires us to reset those values. @@ -113,7 +113,8 @@ class Caffe { protected: cublasHandle_t cublas_handle_; curandGenerator_t curand_generator_; - VSLStreamStatePtr vsl_stream_; + RNG random_generator_; + Brew mode_; Phase phase_; static shared_ptr singleton_; @@ -126,6 +127,21 @@ class Caffe { }; +// CUDA: thread number configuration. +// Use 1024 threads per block, which requires cuda sm_2x or above, +// or fall back to attempt compatibility (best of luck to you). +#if __CUDA_ARCH__ >= 200 + const int CAFFE_CUDA_NUM_THREADS = 1024; +#else + const int CAFFE_CUDA_NUM_THREADS = 512; +#endif + +// CUDA: number of blocks for threads. +inline int CAFFE_GET_BLOCKS(const int N) { + return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS; +} + + } // namespace caffe #endif // CAFFE_COMMON_HPP_ diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index 5b934a331e3..7c1002245d0 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -7,7 +7,6 @@ #ifndef CAFFE_FILLER_HPP #define CAFFE_FILLER_HPP -#include #include #include "caffe/common.hpp" diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index a0cb487e50d..6aaab6fe1b3 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -7,6 +7,7 @@ #include "caffe/blob.hpp" #include "caffe/common.hpp" #include "caffe/proto/caffe.pb.h" +#include "caffe/regularizer.hpp" using std::vector; @@ -28,6 +29,12 @@ class Layer { blobs_[i]->FromProto(layer_param_.blobs(i)); } } + if (layer_param_.regularizer_size() > 0) { + regularizers_.resize(layer_param_.regularizer_size()); + for (int i = 0; i < layer_param_.regularizer_size(); ++i) { + regularizers_[i].reset(GetRegularizer(param.regularizer(i))); + } + } } virtual ~Layer() {} // SetUp: your function should implement this. @@ -37,9 +44,9 @@ class Layer { // Forward and backward wrappers. You should implement the cpu and // gpu specific implementations instead, and should not change these // functions. 
- inline void Forward(const vector*>& bottom, + inline Dtype Forward(const vector*>& bottom, vector*>* top); - inline Dtype Backward(const vector*>& top, + inline void Backward(const vector*>& top, const bool propagate_down, vector*>* bottom); @@ -58,28 +65,30 @@ class Layer { LayerParameter layer_param_; // The vector that stores the parameters as a set of blobs. vector > > blobs_; + // The vector that stores the regularizers. + vector > > regularizers_; // Forward functions - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top) = 0; // If no gpu code is provided, we will simply use cpu code. - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top) { // LOG(WARNING) << "Using CPU code as backup."; - Forward_cpu(bottom, top); + return Forward_cpu(bottom, top); } // Backward functions: the backward function will compute the gradients for // any parameters and also for the bottom blobs if propagate_down is true. // It will return the loss produced from this layer. - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) = 0; - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { // LOG(WARNING) << "Using CPU code as backup."; - return Backward_cpu(top, propagate_down, bottom); + Backward_cpu(top, propagate_down, bottom); } DISABLE_COPY_AND_ASSIGN(Layer); @@ -89,29 +98,38 @@ class Layer { // gpu specific implementations instead, and should not change these // functions. template -inline void Layer::Forward(const vector*>& bottom, +inline Dtype Layer::Forward(const vector*>& bottom, vector*>* top) { + Dtype loss; switch (Caffe::mode()) { case Caffe::CPU: - Forward_cpu(bottom, top); + loss = Forward_cpu(bottom, top); break; case Caffe::GPU: - Forward_gpu(bottom, top); + loss = Forward_gpu(bottom, top); break; default: - LOG(FATAL) << "Unknown caffe mode."; + LOG(FATAL) << "Unknown caffe mode " << Caffe::mode(); } + if (layer_param_.regularizer_size() > 0) { + for (int i = 0; i < layer_param_.regularizer_size(); ++i) { + loss += regularizers_[i]->Regularize(bottom[0]); + } + } + return loss; } template -inline Dtype Layer::Backward(const vector*>& top, +inline void Layer::Backward(const vector*>& top, const bool propagate_down, vector*>* bottom) { switch (Caffe::mode()) { case Caffe::CPU: - return Backward_cpu(top, propagate_down, bottom); + Backward_cpu(top, propagate_down, bottom); + break; case Caffe::GPU: - return Backward_gpu(top, propagate_down, bottom); + Backward_gpu(top, propagate_down, bottom); + break; default: LOG(FATAL) << "Unknown caffe mode."; } diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index b5a57b3c5a4..81fe25d2854 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -31,21 +31,24 @@ class Net { // Run forward with the input blobs already fed separately. You can get the // input blobs using input_blobs(). - const vector*>& ForwardPrefilled(); + const vector*>& ForwardPrefilled(Dtype* loss = NULL); // Run forward using a set of bottom blobs, and return the result. 
- const vector*>& Forward(const vector* > & bottom); + const vector*>& Forward(const vector* > & bottom, + Dtype* loss = NULL); // Run forward using a serialized BlobProtoVector and return the result // as a serialized BlobProtoVector - string Forward(const string& input_blob_protos); + string Forward(const string& input_blob_protos, Dtype* loss = NULL); // The network backward should take no input and output, since it solely // computes the gradient w.r.t the parameters, and the data has already // been provided during the forward pass. - Dtype Backward(); + void Backward(); Dtype ForwardBackward(const vector* > & bottom) { - Forward(bottom); - return Backward(); + Dtype loss; + Forward(bottom, &loss); + Backward(); + return loss; } // Updates the network weights based on the diff values computed. @@ -82,6 +85,13 @@ class Net { inline int num_outputs() { return net_output_blobs_.size(); } inline vector*>& input_blobs() { return net_input_blobs_; } inline vector*>& output_blobs() { return net_output_blobs_; } + // has_blob and blob_by_name are inspired by + // https://github.com/kencoken/caffe/commit/f36e71569455c9fbb4bf8a63c2d53224e32a4e7b + // Access intermediary computation layers, testing with centre image only + bool has_blob(const string& blob_name); + const shared_ptr > blob_by_name(const string& blob_name); + bool has_layer(const string& layer_name); + const shared_ptr > layer_by_name(const string& layer_name); protected: // Function to get misc parameters, e.g. the learning rate multiplier and @@ -91,11 +101,13 @@ class Net { // Individual layers in the net vector > > layers_; vector layer_names_; + map layer_names_index_; vector layer_need_backward_; // blobs stores the blobs that store intermediate results between the // layers. vector > > blobs_; vector blob_names_; + map blob_names_index_; vector blob_need_backward_; // bottom_vecs stores the vectors containing the input for each layer. 
// They don't actually host the blobs (blobs_ does), so we simply store diff --git a/include/caffe/regularizer.hpp b/include/caffe/regularizer.hpp new file mode 100644 index 00000000000..ac524aec28e --- /dev/null +++ b/include/caffe/regularizer.hpp @@ -0,0 +1,75 @@ +// Copyright 2014 kloudkl@github + +#ifndef CAFFE_REGULARIZER_HPP_ +#define CAFFE_REGULARIZER_HPP_ + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +template +class Regularizer { + public: + explicit Regularizer(const RegularizerParameter& param) + : coeff_(Dtype(param.coeff())) { + if (coeff_ < 0) { + LOG(FATAL)<< + "Regularizer coefficient must be greater than or equal to zero"; + } + } + + virtual ~Regularizer() { + } + + virtual Dtype Regularize(Blob* bottom); + virtual Dtype Regularize_cpu(Blob* bottom) = 0; + virtual Dtype Regularize_gpu(Blob* bottom) = 0; + + inline Dtype coeff() { + return coeff_; + } + inline void set_coeff(const Dtype coeff) { + coeff_ = coeff; + } + + protected: + // the weight regularization coefficient + Dtype coeff_; + DISABLE_COPY_AND_ASSIGN(Regularizer); +}; + +#define MAKE_SIMPLE_REGULARIZER_CLASS(type) \ +template \ +class type##Regularizer : public Regularizer { \ + /* NOLINT_NEXT_LINE(whitespace/indent) */ \ + public: \ + type##Regularizer(const RegularizerParameter& param) \ + : Regularizer(param) { \ + } \ + \ + virtual ~type##Regularizer() { \ + } \ + \ + virtual Dtype Regularize_cpu(Blob* bottom); \ + virtual Dtype Regularize_gpu(Blob* bottom); \ + \ + /* NOLINT_NEXT_LINE(whitespace/indent) */ \ + protected: \ + DISABLE_COPY_AND_ASSIGN(type##Regularizer); \ +} + +MAKE_SIMPLE_REGULARIZER_CLASS(L1); +MAKE_SIMPLE_REGULARIZER_CLASS(L2); +MAKE_SIMPLE_REGULARIZER_CLASS(MaxNorm); + +#define REG_TYPE(type) REG_TYPE_PASTE(type) +#define REG_TYPE_PASTE(type) RegularizerParameter_RegularizerType_##type + +template +Regularizer* GetRegularizer(const RegularizerParameter& param); + +} // namespace caffe + +#endif // CAFFE_REGULARIZER_HPP_ diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index 7bf78977d6d..e5405727ee4 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -15,6 +15,8 @@ using std::string; using ::google::protobuf::Message; +#define HDF5_NUM_DIMS 4 + namespace caffe { void ReadProtoFromTextFile(const char* filename, @@ -60,6 +62,10 @@ void hdf5_load_nd_dataset( hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, Blob* blob); +template +void hdf5_save_nd_dataset( + const hid_t file_id, const string dataset_name, const Blob& blob); + } // namespace caffe #endif // CAFFE_UTIL_IO_H_ diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index e9e2db8f274..81097ef9774 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -1,10 +1,14 @@ // Copyright 2013 Yangqing Jia +// Copyright 2014 kloudkl@github #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_ #define CAFFE_UTIL_MATH_FUNCTIONS_H_ -#include #include +#include // for signbit +#include // for std::fabs + +#include "caffe/util/mkl_alternate.hpp" namespace caffe { @@ -44,7 +48,7 @@ void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); template -void caffe_axpby(const int N, const Dtype alpha, const Dtype* X, +void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X, const Dtype beta, Dtype* Y); template @@ -84,6 +88,9 @@ void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); template void caffe_powx(const 
int n, const Dtype* a, const Dtype b, Dtype* y); +template +Dtype caffe_nextafter(const Dtype b); + template void caffe_vRngUniform(const int n, Dtype* r, const Dtype a, const Dtype b); @@ -91,6 +98,9 @@ template void caffe_vRngGaussian(const int n, Dtype* r, const Dtype a, const Dtype sigma); +template +void caffe_vRngBernoulli(const int n, Dtype* r, const double p); + template void caffe_exp(const int n, const Dtype* a, Dtype* y); @@ -100,6 +110,91 @@ Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y); template void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out); +template +int caffe_hamming_distance(const int n, const Dtype* x, const Dtype* y); + +// Returns the sum of the absolute values of the elements of vector x +template +Dtype caffe_cpu_asum(const int n, const Dtype* x); + +template +void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); + +// the branchless, type-safe version from +// http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c +template +inline char caffe_sign(Dtype val) { + return (Dtype(0) < val) - (val < Dtype(0)); +} + +// The following two macros are modifications of DEFINE_VSL_UNARY_FUNC +// in include/caffe/util/mkl_alternate.hpp authored by @Rowland Depp. +// Please refer to commit 7e8ef25c7 of the boost-eigen branch. +// Git cherry picking that commit caused a conflict hard to resolve and +// copying that file in convenient for code reviewing. +// So they have to be pasted here temporarily. +#define DEFINE_CAFFE_CPU_UNARY_FUNC(name, operation) \ + template \ + void caffe_cpu_##name(const int n, const Dtype* x, Dtype* y) { \ + CHECK_GT(n, 0); CHECK(x); CHECK(y); \ + for (int i = 0; i < n; ++i) { \ + operation; \ + } \ + } + +#define INSTANTIATE_CAFFE_CPU_UNARY_FUNC(name) \ + template <> \ + void caffe_cpu_##name(const int n, const float* x, float* y); \ + template <> \ + void caffe_cpu_##name(const int n, const double* x, double* y) + + +#define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \ +template \ +__global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ + int index = threadIdx.x + blockIdx.x * blockDim.x; \ + if (index < n) { \ + operation; \ + } \ +} \ +template <> \ +void caffe_gpu_##name(const int n, const float* x, float* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + n, x, y); \ +} \ +template <> \ +void caffe_gpu_##name(const int n, const double* x, double* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + n, x, y); \ +} + +// output is 1 for the positives, 0 for zero, and -1 for the negatives +DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])); + +template +void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y); + +// This returns a nonzero value if the input has its sign bit set. 
+// The name sngbit is meant to avoid conflicts with std::signbit in the macro +using std::signbit; +DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, y[i] = signbit(x[i])); + +template +void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y); + +DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); + +template +void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y); + +template +void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); + +template +void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); + } // namespace caffe diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp new file mode 100644 index 00000000000..39038dd148e --- /dev/null +++ b/include/caffe/util/mkl_alternate.hpp @@ -0,0 +1,97 @@ +// Copyright 2013 Rowland Depp + +#ifndef CAFFE_UTIL_MKL_ALTERNATE_H_ +#define CAFFE_UTIL_MKL_ALTERNATE_H_ + +#ifdef USE_MKL + +#include + +#else // If use MKL, simply include the MKL header + +extern "C" { +#include +} +#include + +// Functions that caffe uses but are not present if MKL is not linked. + +// A simple way to define the vsl unary functions. The operation should +// be in the form e.g. y[i] = sqrt(a[i]) +#define DEFINE_VSL_UNARY_FUNC(name, operation) \ + template \ + void v##name(const int n, const Dtype* a, Dtype* y) { \ + CHECK_GT(n, 0); CHECK(a); CHECK(y); \ + for (int i = 0; i < n; ++i) { operation; } \ + } \ + inline void vs##name( \ + const int n, const float* a, float* y) { \ + v##name(n, a, y); \ + } \ + inline void vd##name( \ + const int n, const double* a, double* y) { \ + v##name(n, a, y); \ + } + +DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]); +DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i])); + +// A simple way to define the vsl unary functions with singular parameter b. +// The operation should be in the form e.g. y[i] = pow(a[i], b) +#define DEFINE_VSL_UNARY_FUNC_WITH_PARAM(name, operation) \ + template \ + void v##name(const int n, const Dtype* a, const Dtype b, Dtype* y) { \ + CHECK_GT(n, 0); CHECK(a); CHECK(y); \ + for (int i = 0; i < n; ++i) { operation; } \ + } \ + inline void vs##name( \ + const int n, const float* a, const float b, float* y) { \ + v##name(n, a, b, y); \ + } \ + inline void vd##name( \ + const int n, const double* a, const float b, double* y) { \ + v##name(n, a, b, y); \ + } + +DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b)); + +// A simple way to define the vsl binary functions. The operation should +// be in the form e.g. y[i] = a[i] + b[i] +#define DEFINE_VSL_BINARY_FUNC(name, operation) \ + template \ + void v##name(const int n, const Dtype* a, const Dtype* b, Dtype* y) { \ + CHECK_GT(n, 0); CHECK(a); CHECK(b); CHECK(y); \ + for (int i = 0; i < n; ++i) { operation; } \ + } \ + inline void vs##name( \ + const int n, const float* a, const float* b, float* y) { \ + v##name(n, a, b, y); \ + } \ + inline void vd##name( \ + const int n, const double* a, const double* b, double* y) { \ + v##name(n, a, b, y); \ + } + +DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]); +DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i]); +DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i]); +DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]); + +// In addition, MKL comes with an additional function axpby that is not present +// in standard blas. We will simply use a two-step (inefficient, of course) way +// to mimic that. 
+inline void cblas_saxpby(const int N, const float alpha, const float* X, + const int incX, const float beta, float* Y, + const int incY) { + cblas_sscal(N, beta, Y, incY); + cblas_saxpy(N, alpha, X, incX, Y, incY); +} +inline void cblas_daxpby(const int N, const double alpha, const double* X, + const int incX, const double beta, double* Y, + const int incY) { + cblas_dscal(N, beta, Y, incY); + cblas_daxpy(N, alpha, X, incX, Y, incY); +} + +#endif // USE_MKL +#endif // CAFFE_UTIL_MKL_ALTERNATE_H_ diff --git a/include/caffe/util/rng.hpp b/include/caffe/util/rng.hpp new file mode 100644 index 00000000000..8151a9a6f67 --- /dev/null +++ b/include/caffe/util/rng.hpp @@ -0,0 +1,19 @@ +// Copyright 2014 BVLC and contributors. + +#ifndef CAFFE_RNG_CPP_HPP_ +#define CAFFE_RNG_CPP_HPP_ + +#include +#include "caffe/common.hpp" + +namespace caffe { + + typedef boost::mt19937 rng_t; + inline rng_t& caffe_rng() { + Caffe::RNG &generator = Caffe::rng_stream(); + return *(caffe::rng_t*) generator.generator(); + } + +} // namespace caffe + +#endif // CAFFE_RNG_HPP_ diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 90e2caa664f..0d5bf4e79a7 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -14,6 +14,10 @@ #include "caffe/layer.hpp" #include "caffe/proto/caffe.pb.h" +#include "caffe/regularizer.hpp" + +#define HDF5_DATA_DATASET_NAME "data" +#define HDF5_DATA_LABEL_NAME "label" namespace caffe { @@ -37,14 +41,14 @@ class ReLULayer : public NeuronLayer { : NeuronLayer(param) {} protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); }; @@ -55,14 +59,14 @@ class TanHLayer : public NeuronLayer { : NeuronLayer(param) {} protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); }; @@ -73,14 +77,14 @@ class SigmoidLayer : public NeuronLayer { : NeuronLayer(param) {} protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); }; @@ -92,14 +96,14 @@ class BNLLLayer : public NeuronLayer { : NeuronLayer(param) {} protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* 
top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); }; @@ -113,14 +117,14 @@ class DropoutLayer : public NeuronLayer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); shared_ptr rand_vec_; float threshold_; @@ -138,13 +142,13 @@ class SplitLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); int count_; }; @@ -159,13 +163,13 @@ class FlattenLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); int count_; }; @@ -180,14 +184,14 @@ class InnerProductLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); int M_; int K_; @@ -206,13 +210,13 @@ class PaddingLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool 
propagate_down, vector*>* bottom); unsigned int PAD_; int NUM_; @@ -233,13 +237,13 @@ class LRNLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); // scale_ stores the intermediate summing results Blob scale_; @@ -263,13 +267,13 @@ class Im2colLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); int KSIZE_; int STRIDE_; @@ -288,13 +292,13 @@ class PoolingLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); int KSIZE_; int STRIDE_; @@ -316,13 +320,13 @@ class ConvolutionLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); Blob col_bob_; @@ -352,13 +356,13 @@ class ConcatLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); Blob col_bob_; @@ -387,14 +391,14 @@ class DataLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype 
Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, - const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, - const bool propagate_down, vector*>* bottom); + virtual void Backward_cpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { return; } + virtual void Backward_gpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { return; } shared_ptr db_; shared_ptr iter_; @@ -425,14 +429,14 @@ class ImagesLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, - const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, - const bool propagate_down, vector*>* bottom); + virtual void Backward_cpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { return; } + virtual void Backward_gpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { return; } vector > lines_; int lines_id_; @@ -457,13 +461,13 @@ class HDF5DataLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); virtual void load_hdf5_file_data(const char* filename); @@ -477,6 +481,33 @@ class HDF5DataLayer : public Layer { }; +template +class HDF5OutputLayer : public Layer { + public: + explicit HDF5OutputLayer(const LayerParameter& param); + virtual ~HDF5OutputLayer(); + virtual void SetUp(const vector*>& bottom, + vector*>* top); + inline std::string file_name() const { return file_name_; } + + protected: + virtual Dtype Forward_cpu(const vector*>& bottom, + vector*>* top); + virtual Dtype Forward_gpu(const vector*>& bottom, + vector*>* top); + virtual void Backward_cpu(const vector*>& top, + const bool propagate_down, vector*>* bottom); + virtual void Backward_gpu(const vector*>& top, + const bool propagate_down, vector*>* bottom); + virtual void SaveBlobs(); + + std::string file_name_; + hid_t file_id_; + Blob data_blob_; + Blob label_blob_; +}; + + template class SoftmaxLayer : public Layer { public: @@ -486,13 +517,13 @@ class SoftmaxLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); // sum_multiplier is just used to carry out sum using blas @@ -513,13 +544,13 @@ class MultinomialLogisticLossLayer : public Layer { protected: // The 
loss layer will do nothing during forward - all computation are // carried out in the backward pass. - virtual void Forward_cpu(const vector*>& bottom, - vector*>* top) { return; } - virtual void Forward_gpu(const vector*>& bottom, - vector*>* top) { return; } - virtual Dtype Backward_cpu(const vector*>& top, + virtual Dtype Forward_cpu(const vector*>& bottom, + vector*>* top); + // virtual Dtype Forward_gpu(const vector*>& bottom, + // vector*>* top); + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - // virtual Dtype Backward_gpu(const vector*>& top, + // virtual void Backward_gpu(const vector*>& top, // const bool propagate_down, vector*>* bottom); }; @@ -534,13 +565,13 @@ class InfogainLossLayer : public Layer { protected: // The loss layer will do nothing during forward - all computation are // carried out in the backward pass. - virtual void Forward_cpu(const vector*>& bottom, - vector*>* top) { return; } - virtual void Forward_gpu(const vector*>& bottom, - vector*>* top) { return; } - virtual Dtype Backward_cpu(const vector*>& top, + virtual Dtype Forward_cpu(const vector*>& bottom, + vector*>* top); + // virtual Dtype Forward_gpu(const vector*>& bottom, + // vector*>* top); + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - // virtual Dtype Backward_gpu(const vector*>& top, + // virtual void Backward_gpu(const vector*>& top, // const bool propagate_down, vector*>* bottom); Blob infogain_; @@ -561,13 +592,13 @@ class SoftmaxWithLossLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, + virtual void Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom); shared_ptr > softmax_layer_; @@ -590,17 +621,46 @@ class EuclideanLossLayer : public Layer { protected: // The loss layer will do nothing during forward - all computation are // carried out in the backward pass. - virtual void Forward_cpu(const vector*>& bottom, - vector*>* top) { return; } - virtual void Forward_gpu(const vector*>& bottom, - vector*>* top) { return; } - virtual Dtype Backward_cpu(const vector*>& top, + virtual Dtype Forward_cpu(const vector*>& bottom, + vector*>* top); + // virtual Dtype Forward_gpu(const vector*>& bottom, + // vector*>* top); + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); - // virtual Dtype Backward_gpu(const vector*>& top, + // virtual void Backward_gpu(const vector*>& top, // const bool propagate_down, vector*>* bottom); Blob difference_; }; +// The most natural places should the Regularizer subclasses +// be used are in the Layer::Backward* methods. +// The most beneficial use case is to succinctly test this layer +// following the practice in test_regularizer_as_loss_layer.cpp +// instead of testing the Regularizers for every other kind of layer +// which would be combination explosion. +// If you do want to use this layer as an independent layer in a network model, +// be cautious that it may incur unnecessary extra memory usage compared +// with the recommended method. 
+template +class RegularizerAsLossLayer : public Layer { + public: + explicit RegularizerAsLossLayer(const LayerParameter& param); + virtual void SetUp(const vector*>& bottom, + vector*>* top); + + protected: + virtual Dtype Forward_cpu(const vector*>& bottom, + vector*>* top); + virtual Dtype Forward_gpu(const vector*>& bottom, + vector*>* top); + virtual void Backward_cpu(const vector*>& top, + const bool propagate_down, vector*>* bottom); + virtual void Backward_gpu(const vector*>& top, + const bool propagate_down, vector*>* bottom); + + vector > > regularizers_; + int num_regularizers_; +}; template class AccuracyLayer : public Layer { @@ -611,13 +671,12 @@ class AccuracyLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); // The accuracy layer should not be used to compute backward operations. - virtual Dtype Backward_cpu(const vector*>& top, + virtual void Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { NOT_IMPLEMENTED; - return Dtype(0.); } }; @@ -638,14 +697,14 @@ class WindowDataLayer : public Layer { vector*>* top); protected: - virtual void Forward_cpu(const vector*>& bottom, + virtual Dtype Forward_cpu(const vector*>& bottom, vector*>* top); - virtual void Forward_gpu(const vector*>& bottom, + virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual Dtype Backward_cpu(const vector*>& top, - const bool propagate_down, vector*>* bottom); - virtual Dtype Backward_gpu(const vector*>& top, - const bool propagate_down, vector*>* bottom); + virtual void Backward_cpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { return; } + virtual void Backward_gpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { return; } pthread_t thread_; shared_ptr > prefetch_data_; diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index f47173afcae..59cbc56b61c 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -1,15 +1,17 @@ -// Copyright 2013 Yangqing Jia +// Copyright 2014 BVLC and contributors. #include #include #include "caffe/common.hpp" +#include "caffe/util/rng.hpp" namespace caffe { shared_ptr Caffe::singleton_; +// curand seeding int64_t cluster_seedgen(void) { int64_t s, seed, pid; pid = getpid(); @@ -21,7 +23,8 @@ int64_t cluster_seedgen(void) { Caffe::Caffe() : mode_(Caffe::CPU), phase_(Caffe::TRAIN), cublas_handle_(NULL), - curand_generator_(NULL), vsl_stream_(NULL) { + curand_generator_(NULL), + random_generator_() { // Try to create a cublas handler, and report an error if failed (but we will // keep the program running as one might just want to run CPU code). if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { @@ -34,13 +37,6 @@ Caffe::Caffe() != CURAND_STATUS_SUCCESS) { LOG(ERROR) << "Cannot create Curand generator. Curand won't be available."; } - // Try to create a vsl stream. This should almost always work, but we will - // check it anyway. - if (vslNewStream(&vsl_stream_, VSL_BRNG_MT19937, - cluster_seedgen()) != VSL_STATUS_OK) { - LOG(ERROR) << "Cannot create vsl stream. 
VSL random number generator " - << "won't be available."; - } } Caffe::~Caffe() { @@ -48,7 +44,6 @@ Caffe::~Caffe() { if (curand_generator_) { CURAND_CHECK(curandDestroyGenerator(curand_generator_)); } - if (vsl_stream_) VSL_CHECK(vslDeleteStream(&vsl_stream_)); } void Caffe::set_random_seed(const unsigned int seed) { @@ -64,9 +59,8 @@ void Caffe::set_random_seed(const unsigned int seed) { } else { LOG(ERROR) << "Curand not available. Skipping setting the curand seed."; } - // VSL seed - VSL_CHECK(vslDeleteStream(&(Get().vsl_stream_))); - VSL_CHECK(vslNewStream(&(Get().vsl_stream_), VSL_BRNG_MT19937, seed)); + // RNG seed + Get().random_generator_ = RNG(seed); } void Caffe::SetDevice(const int device_id) { @@ -120,4 +114,37 @@ void Caffe::DeviceQuery() { return; } + +class Caffe::RNG::Generator { + public: + caffe::rng_t rng; +}; + +Caffe::RNG::RNG() +: generator_(new Generator) { } + +Caffe::RNG::RNG(unsigned int seed) +: generator_(new Generator) { + generator_->rng = caffe::rng_t(seed); +} + +Caffe::RNG::~RNG() { delete generator_; } + +Caffe::RNG::RNG(const RNG& other) : generator_(new Generator) { + *generator_ = *other.generator_; +} + +Caffe::RNG& Caffe::RNG::operator=(const RNG& other) { + *generator_ = *other.generator_; + return *this; +} + +void* Caffe::RNG::generator() { + return &generator_->rng; +} + +const void* Caffe::RNG::generator() const { + return &generator_->rng; +} + } // namespace caffe diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index 54e90d21034..efc7c0b64f6 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -37,6 +37,8 @@ Layer* GetLayer(const LayerParameter& param) { return new FlattenLayer(param); } else if (type == "hdf5_data") { return new HDF5DataLayer(param); + } else if (type == "hdf5_output") { + return new HDF5OutputLayer(param); } else if (type == "images") { return new ImagesLayer(param); } else if (type == "im2col") { @@ -53,6 +55,8 @@ Layer* GetLayer(const LayerParameter& param) { return new PaddingLayer(param); } else if (type == "pool") { return new PoolingLayer(param); + } else if (type == "regularizer_as_loss") { + return new RegularizerAsLossLayer(param); } else if (type == "relu") { return new ReLULayer(param); } else if (type == "sigmoid") { diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index b769a35212a..e7a4fba2d67 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -13,7 +13,7 @@ namespace caffe { const float kBNLL_THRESHOLD = 50.; template -void BNLLLayer::Forward_cpu(const vector*>& bottom, +Dtype BNLLLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = (*top)[0]->mutable_cpu_data(); @@ -23,10 +23,11 @@ void BNLLLayer::Forward_cpu(const vector*>& bottom, bottom_data[i] + log(1. + exp(-bottom_data[i])) : log(1. 
+ exp(bottom_data[i])); } + return Dtype(0); } template -Dtype BNLLLayer::Backward_cpu(const vector*>& top, +void BNLLLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { if (propagate_down) { @@ -40,7 +41,6 @@ Dtype BNLLLayer::Backward_cpu(const vector*>& top, bottom_diff[i] = top_diff[i] * expval / (expval + 1.); } } - return Dtype(0); } diff --git a/src/caffe/layers/bnll_layer.cu b/src/caffe/layers/bnll_layer.cu index 1fd200894c3..7252c0222c0 100644 --- a/src/caffe/layers/bnll_layer.cu +++ b/src/caffe/layers/bnll_layer.cu @@ -22,7 +22,7 @@ __global__ void BNLLForward(const int n, const Dtype* in, Dtype* out) { } template -void BNLLLayer::Forward_gpu(const vector*>& bottom, +Dtype BNLLLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = (*top)[0]->mutable_gpu_data(); @@ -31,6 +31,7 @@ void BNLLLayer::Forward_gpu(const vector*>& bottom, BNLLForward<<>>( count, bottom_data, top_data); CUDA_POST_KERNEL_CHECK; + return Dtype(0); } template @@ -43,7 +44,7 @@ __global__ void BNLLBackward(const int n, const Dtype* in_diff, } template -Dtype BNLLLayer::Backward_gpu(const vector*>& top, +void BNLLLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { if (propagate_down) { @@ -56,7 +57,6 @@ Dtype BNLLLayer::Backward_gpu(const vector*>& top, count, top_diff, bottom_data, bottom_diff); CUDA_POST_KERNEL_CHECK; } - return Dtype(0); } INSTANTIATE_CLASS(BNLLLayer); diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index dc949c14010..e65451061b0 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -42,7 +42,7 @@ void ConcatLayer::SetUp(const vector*>& bottom, } template -void ConcatLayer::Forward_cpu(const vector*>& bottom, +Dtype ConcatLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { Dtype* top_data = (*top)[0]->mutable_cpu_data(); if (concat_dim_== 0) { @@ -69,10 +69,11 @@ void ConcatLayer::Forward_cpu(const vector*>& bottom, LOG(FATAL) << "concat_dim along dim" << concat_dim_ << " not implemented yet"; } + return Dtype(0.); } template -Dtype ConcatLayer::Backward_cpu(const vector*>& top, +void ConcatLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->cpu_diff(); if (concat_dim_ == 0) { @@ -100,7 +101,6 @@ Dtype ConcatLayer::Backward_cpu(const vector*>& top, LOG(FATAL) << "concat_dim along dim" << concat_dim_ << " not implemented yet"; } - return Dtype(0.); } INSTANTIATE_CLASS(ConcatLayer); diff --git a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/concat_layer.cu index 616a5e61683..8a20cea64cf 100644 --- a/src/caffe/layers/concat_layer.cu +++ b/src/caffe/layers/concat_layer.cu @@ -9,7 +9,7 @@ namespace caffe { template -void ConcatLayer::Forward_gpu(const vector*>& bottom, +Dtype ConcatLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { Dtype* top_data = (*top)[0]->mutable_gpu_data(); if (concat_dim_ == 0) { @@ -36,10 +36,11 @@ void ConcatLayer::Forward_gpu(const vector*>& bottom, LOG(FATAL) << "concat_dim along dim" << concat_dim_ << " not implemented yet"; } + return Dtype(0.); } template -Dtype ConcatLayer::Backward_gpu(const vector*>& top, +void ConcatLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->gpu_diff(); if (concat_dim_ == 0) { @@ -67,7 +68,6 @@ Dtype ConcatLayer::Backward_gpu(const vector*>& top, LOG(FATAL) << 
"concat_dim along dim" << concat_dim_ << " not implemented yet"; } - return Dtype(0.); } INSTANTIATE_CLASS(ConcatLayer); diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 64a652a8e1d..cb1bca6579c 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -77,7 +77,7 @@ void ConvolutionLayer::SetUp(const vector*>& bottom, template -void ConvolutionLayer::Forward_cpu(const vector*>& bottom, +Dtype ConvolutionLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = (*top)[0]->mutable_cpu_data(); @@ -104,10 +104,11 @@ void ConvolutionLayer::Forward_cpu(const vector*>& bottom, (Dtype)1., top_data + (*top)[0]->offset(n)); } } + return Dtype(0.); } template -Dtype ConvolutionLayer::Backward_cpu(const vector*>& top, +void ConvolutionLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* weight = this->blobs_[0]->cpu_data(); @@ -159,7 +160,6 @@ Dtype ConvolutionLayer::Backward_cpu(const vector*>& top, bottom_diff + (*bottom)[0]->offset(n)); } } - return Dtype(0.); } INSTANTIATE_CLASS(ConvolutionLayer); diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu index a7f56faa97b..f8f605584d1 100644 --- a/src/caffe/layers/conv_layer.cu +++ b/src/caffe/layers/conv_layer.cu @@ -11,7 +11,7 @@ namespace caffe { template -void ConvolutionLayer::Forward_gpu(const vector*>& bottom, +Dtype ConvolutionLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = (*top)[0]->mutable_gpu_data(); @@ -38,10 +38,11 @@ void ConvolutionLayer::Forward_gpu(const vector*>& bottom, (Dtype)1., top_data + (*top)[0]->offset(n)); } } + return Dtype(0.); } template -Dtype ConvolutionLayer::Backward_gpu(const vector*>& top, +void ConvolutionLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->gpu_diff(); const Dtype* weight = this->blobs_[0]->gpu_data(); @@ -95,7 +96,6 @@ Dtype ConvolutionLayer::Backward_gpu(const vector*>& top, bottom_diff + (*bottom)[0]->offset(n)); } } - return Dtype(0.); } diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index cc03cdbf0b7..f2ff7ff1d93 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -213,7 +213,7 @@ void DataLayer::SetUp(const vector*>& bottom, } template -void DataLayer::Forward_cpu(const vector*>& bottom, +Dtype DataLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { // First, join the thread CHECK(!pthread_join(thread_, NULL)) << "Pthread joining failed."; @@ -225,12 +225,6 @@ void DataLayer::Forward_cpu(const vector*>& bottom, // Start a new prefetch thread CHECK(!pthread_create(&thread_, NULL, DataLayerPrefetch, reinterpret_cast(this))) << "Pthread execution failed."; -} - -// The backward operations are dummy - they do not carry any computation. 
-template -Dtype DataLayer::Backward_cpu(const vector*>& top, - const bool propagate_down, vector*>* bottom) { return Dtype(0.); } diff --git a/src/caffe/layers/data_layer.cu b/src/caffe/layers/data_layer.cu index 946f30f3b7f..57a375ea205 100644 --- a/src/caffe/layers/data_layer.cu +++ b/src/caffe/layers/data_layer.cu @@ -16,7 +16,7 @@ using std::string; namespace caffe { template -void DataLayer::Forward_gpu(const vector*>& bottom, +Dtype DataLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { // First, join the thread CHECK(!pthread_join(thread_, NULL)) << "Pthread joining failed."; @@ -30,12 +30,6 @@ void DataLayer::Forward_gpu(const vector*>& bottom, // Start a new prefetch thread CHECK(!pthread_create(&thread_, NULL, DataLayerPrefetch, reinterpret_cast(this))) << "Pthread execution failed."; -} - -// The backward operations are dummy - they do not carry any computation. -template -Dtype DataLayer::Backward_gpu(const vector*>& top, - const bool propagate_down, vector*>* bottom) { return Dtype(0.); } diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index f480853cdf3..f07547ad81a 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -3,6 +3,7 @@ #include #include "caffe/common.hpp" +#include "caffe/util/math_functions.hpp" #include "caffe/layer.hpp" #include "caffe/syncedmem.hpp" #include "caffe/vision_layers.hpp" @@ -23,7 +24,7 @@ void DropoutLayer::SetUp(const vector*>& bottom, } template -void DropoutLayer::Forward_cpu(const vector*>& bottom, +Dtype DropoutLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = (*top)[0]->mutable_cpu_data(); @@ -31,18 +32,18 @@ void DropoutLayer::Forward_cpu(const vector*>& bottom, const int count = bottom[0]->count(); if (Caffe::phase() == Caffe::TRAIN) { // Create random numbers - viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(), - count, mask, 1. - threshold_); + caffe_vRngBernoulli(count, mask, 1. 
- threshold_); for (int i = 0; i < count; ++i) { top_data[i] = bottom_data[i] * mask[i] * scale_; } } else { memcpy(top_data, bottom_data, bottom[0]->count() * sizeof(Dtype)); } + return Dtype(0); } template -Dtype DropoutLayer::Backward_cpu(const vector*>& top, +void DropoutLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { CHECK(Caffe::phase() == Caffe::TRAIN); @@ -55,7 +56,6 @@ Dtype DropoutLayer::Backward_cpu(const vector*>& top, bottom_diff[i] = top_diff[i] * mask[i] * scale_; } } - return Dtype(0); } diff --git a/src/caffe/layers/dropout_layer.cu b/src/caffe/layers/dropout_layer.cu index 0b38ae2a576..dc1f3cf8740 100644 --- a/src/caffe/layers/dropout_layer.cu +++ b/src/caffe/layers/dropout_layer.cu @@ -24,7 +24,7 @@ __global__ void DropoutForward(const int n, const Dtype* in, } template -void DropoutLayer::Forward_gpu(const vector*>& bottom, +Dtype DropoutLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = (*top)[0]->mutable_gpu_data(); @@ -42,6 +42,7 @@ void DropoutLayer::Forward_gpu(const vector*>& bottom, CUDA_CHECK(cudaMemcpy(top_data, bottom_data, count * sizeof(Dtype), cudaMemcpyDeviceToDevice)); } + return Dtype(0); } template @@ -54,7 +55,7 @@ __global__ void DropoutBackward(const int n, const Dtype* in_diff, } template -Dtype DropoutLayer::Backward_gpu(const vector*>& top, +void DropoutLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { CHECK(Caffe::phase() == Caffe::TRAIN); @@ -68,7 +69,6 @@ Dtype DropoutLayer::Backward_gpu(const vector*>& top, count, top_diff, mask, uint_thres_, scale_, bottom_diff); CUDA_POST_KERNEL_CHECK; } - return Dtype(0); } INSTANTIATE_CLASS(DropoutLayer); diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp index 9e17a8200c1..d8d5c4b6053 100644 --- a/src/caffe/layers/flatten_layer.cpp +++ b/src/caffe/layers/flatten_layer.cpp @@ -22,20 +22,20 @@ void FlattenLayer::SetUp(const vector*>& bottom, } template -void FlattenLayer::Forward_cpu(const vector*>& bottom, +Dtype FlattenLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = (*top)[0]->mutable_cpu_data(); caffe_copy(count_, bottom_data, top_data); + return Dtype(0.); } template -Dtype FlattenLayer::Backward_cpu(const vector*>& top, +void FlattenLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->cpu_diff(); Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); caffe_copy(count_, top_diff, bottom_diff); - return Dtype(0.); } INSTANTIATE_CLASS(FlattenLayer); diff --git a/src/caffe/layers/flatten_layer.cu b/src/caffe/layers/flatten_layer.cu index 571e22e2417..fa1e6aa3141 100644 --- a/src/caffe/layers/flatten_layer.cu +++ b/src/caffe/layers/flatten_layer.cu @@ -9,20 +9,20 @@ namespace caffe { template -void FlattenLayer::Forward_gpu(const vector*>& bottom, +Dtype FlattenLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = (*top)[0]->mutable_gpu_data(); caffe_gpu_copy(count_, bottom_data, top_data); + return Dtype(0.); } template -Dtype FlattenLayer::Backward_gpu(const vector*>& top, +void FlattenLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); caffe_gpu_copy(count_, top_diff, 
bottom_diff); - return Dtype(0.); } INSTANTIATE_CLASS(FlattenLayer); diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index e5b17fedb20..3f87dbc512e 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -89,7 +89,7 @@ void HDF5DataLayer::SetUp(const vector*>& bottom, } template -void HDF5DataLayer::Forward_cpu(const vector*>& bottom, +Dtype HDF5DataLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { const int batchsize = this->layer_param_.batchsize(); const int data_count = (*top)[0]->count() / (*top)[0]->num(); @@ -118,14 +118,13 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, &label_blob_.cpu_data()[current_row_ * label_data_count], sizeof(Dtype) * label_data_count); } + return Dtype(0.); } // The backward operations are dummy - they do not carry any computation. template -Dtype HDF5DataLayer::Backward_cpu(const vector*>& top, - const bool propagate_down, vector*>* bottom) { - return Dtype(0.); -} +void HDF5DataLayer::Backward_cpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { } INSTANTIATE_CLASS(HDF5DataLayer); diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu index bed7f35a156..261d404d551 100644 --- a/src/caffe/layers/hdf5_data_layer.cu +++ b/src/caffe/layers/hdf5_data_layer.cu @@ -20,7 +20,7 @@ using std::string; namespace caffe { template -void HDF5DataLayer::Forward_gpu(const vector*>& bottom, +Dtype HDF5DataLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { const int batchsize = this->layer_param_.batchsize(); const int data_count = (*top)[0]->count() / (*top)[0]->num(); @@ -53,12 +53,12 @@ void HDF5DataLayer::Forward_gpu(const vector*>& bottom, sizeof(Dtype) * label_data_count, cudaMemcpyHostToDevice)); } + return Dtype(0.); } template -Dtype HDF5DataLayer::Backward_gpu(const vector*>& top, +void HDF5DataLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { - return Dtype(0.); } INSTANTIATE_CLASS(HDF5DataLayer); diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp new file mode 100644 index 00000000000..f8433c16680 --- /dev/null +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -0,0 +1,88 @@ +// Copyright 2014 BVLC and contributors. +/* +Contributors: +- kloudkl@github, 2014. 
+*/ + +#include + +#include "hdf5.h" +#include "hdf5_hl.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/util/io.hpp" +#include "caffe/vision_layers.hpp" + +namespace caffe { +using std::vector; + +template +HDF5OutputLayer::HDF5OutputLayer(const LayerParameter& param) + : Layer(param), + file_name_(param.hdf5_output_param().file_name()) { + /* create a HDF5 file */ + file_id_ = H5Fcreate(file_name_.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, + H5P_DEFAULT); + CHECK_GE(file_id_, 0) << "Failed to open HDF5 file" << file_name_; +} + +template +HDF5OutputLayer::~HDF5OutputLayer() { + herr_t status = H5Fclose(file_id_); + CHECK_GE(status, 0) << "Failed to close HDF5 file " << file_name_; +} + +template +void HDF5OutputLayer::SaveBlobs() { + // TODO: no limit on the number of blobs + LOG(INFO) << "Saving HDF5 file" << file_name_; + CHECK_EQ(data_blob_.num(), label_blob_.num()) << + "data blob and label blob must have the same batch size"; + hdf5_save_nd_dataset(file_id_, HDF5_DATA_DATASET_NAME, data_blob_); + hdf5_save_nd_dataset(file_id_, HDF5_DATA_LABEL_NAME, label_blob_); + LOG(INFO) << "Successfully saved " << data_blob_.num() << " rows"; +} + +template +void HDF5OutputLayer::SetUp(const vector*>& bottom, + vector*>* top) { + // TODO: no limit on the number of blobs + CHECK_EQ(bottom.size(), 2) << "HDF5OutputLayer takes two blobs as input."; + CHECK_EQ(top->size(), 0) << "HDF5OutputLayer takes no output blobs."; +} + +template +Dtype HDF5OutputLayer::Forward_cpu(const vector*>& bottom, + vector*>* top) { + CHECK_GE(bottom.size(), 2); + CHECK_EQ(bottom[0]->num(), bottom[1]->num()); + data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), + bottom[1]->height(), bottom[1]->width()); + const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); + const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); + + for (int i = 0; i < bottom[0]->num(); ++i) { + memcpy(&data_blob_.mutable_cpu_data()[i * data_datum_dim], + &bottom[0]->cpu_data()[i * data_datum_dim], + sizeof(Dtype) * data_datum_dim); + memcpy(&label_blob_.mutable_cpu_data()[i * label_datum_dim], + &bottom[1]->cpu_data()[i * label_datum_dim], + sizeof(Dtype) * label_datum_dim); + } + SaveBlobs(); + return Dtype(0.); +} + +template +void HDF5OutputLayer::Backward_cpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { + return; +} + +INSTANTIATE_CLASS(HDF5OutputLayer); + +} // namespace caffe diff --git a/src/caffe/layers/hdf5_output_layer.cu b/src/caffe/layers/hdf5_output_layer.cu new file mode 100644 index 00000000000..b5d10888653 --- /dev/null +++ b/src/caffe/layers/hdf5_output_layer.cu @@ -0,0 +1,53 @@ +// Copyright 2014 BVLC and contributors. +/* +Contributors: +- kloudkl@github, 2014. 
+*/ + +#include + +#include "hdf5.h" +#include "hdf5_hl.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/util/io.hpp" +#include "caffe/vision_layers.hpp" + +namespace caffe { +using std::vector; + +template +Dtype HDF5OutputLayer::Forward_gpu(const vector*>& bottom, + vector*>* top) { + CHECK_GE(bottom.size(), 2); + CHECK_EQ(bottom[0]->num(), bottom[1]->num()); + data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), + bottom[1]->height(), bottom[1]->width()); + const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); + const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); + + for (int i = 0; i < bottom[0]->num(); ++i) { + CUDA_CHECK(cudaMemcpy(&data_blob_.mutable_cpu_data()[i * data_datum_dim], + &bottom[0]->gpu_data()[i * data_datum_dim], + sizeof(Dtype) * data_datum_dim, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(&label_blob_.mutable_cpu_data()[i * label_datum_dim], + &bottom[1]->gpu_data()[i * label_datum_dim], + sizeof(Dtype) * label_datum_dim, cudaMemcpyDeviceToHost)); + } + SaveBlobs(); + return Dtype(0.); +} + +template +void HDF5OutputLayer::Backward_gpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { + return; +} + +INSTANTIATE_CLASS(HDF5OutputLayer); + +} // namespace caffe diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index e711713b895..a01bfb7c21c 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -26,7 +26,7 @@ void Im2colLayer::SetUp(const vector*>& bottom, } template -void Im2colLayer::Forward_cpu(const vector*>& bottom, +Dtype Im2colLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = (*top)[0]->mutable_cpu_data(); @@ -34,10 +34,11 @@ void Im2colLayer::Forward_cpu(const vector*>& bottom, im2col_cpu(bottom_data + bottom[0]->offset(n), CHANNELS_, HEIGHT_, WIDTH_, KSIZE_, PAD_, STRIDE_, top_data + (*top)[0]->offset(n)); } + return Dtype(0.); } template -Dtype Im2colLayer::Backward_cpu(const vector*>& top, +void Im2colLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->cpu_diff(); Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); @@ -45,7 +46,6 @@ Dtype Im2colLayer::Backward_cpu(const vector*>& top, col2im_cpu(top_diff + top[0]->offset(n), CHANNELS_, HEIGHT_, WIDTH_, KSIZE_, PAD_, STRIDE_, bottom_diff + (*bottom)[0]->offset(n)); } - return Dtype(0.); } INSTANTIATE_CLASS(Im2colLayer); diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu index 2d949b12296..64731cc53d8 100644 --- a/src/caffe/layers/im2col_layer.cu +++ b/src/caffe/layers/im2col_layer.cu @@ -10,7 +10,7 @@ namespace caffe { template -void Im2colLayer::Forward_gpu(const vector*>& bottom, +Dtype Im2colLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = (*top)[0]->mutable_gpu_data(); @@ -18,10 +18,11 @@ void Im2colLayer::Forward_gpu(const vector*>& bottom, im2col_gpu(bottom_data + bottom[0]->offset(n), CHANNELS_, HEIGHT_, WIDTH_, KSIZE_, PAD_, STRIDE_, top_data + (*top)[0]->offset(n)); } + return Dtype(0.); } template -Dtype Im2colLayer::Backward_gpu(const vector*>& top, +void Im2colLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* 
top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); @@ -29,7 +30,6 @@ Dtype Im2colLayer::Backward_gpu(const vector*>& top, col2im_gpu(top_diff + top[0]->offset(n), CHANNELS_, HEIGHT_, WIDTH_, KSIZE_, PAD_, STRIDE_, bottom_diff + (*bottom)[0]->offset(n)); } - return Dtype(0.); } diff --git a/src/caffe/layers/images_layer.cpp b/src/caffe/layers/images_layer.cpp index e750e01b266..6208a9e7fa6 100644 --- a/src/caffe/layers/images_layer.cpp +++ b/src/caffe/layers/images_layer.cpp @@ -233,7 +233,7 @@ void ImagesLayer::SetUp(const vector*>& bottom, } template -void ImagesLayer::Forward_cpu(const vector*>& bottom, +Dtype ImagesLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { // First, join the thread CHECK(!pthread_join(thread_, NULL)) << "Pthread joining failed."; @@ -245,10 +245,11 @@ void ImagesLayer::Forward_cpu(const vector*>& bottom, // Start a new prefetch thread CHECK(!pthread_create(&thread_, NULL, ImagesLayerPrefetch, reinterpret_cast(this))) << "Pthread execution failed."; + return Dtype(0.); } template -void ImagesLayer::Forward_gpu(const vector*>& bottom, +Dtype ImagesLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { // First, join the thread CHECK(!pthread_join(thread_, NULL)) << "Pthread joining failed."; @@ -262,18 +263,6 @@ void ImagesLayer::Forward_gpu(const vector*>& bottom, // Start a new prefetch thread CHECK(!pthread_create(&thread_, NULL, ImagesLayerPrefetch, reinterpret_cast(this))) << "Pthread execution failed."; -} - -// The backward operations are dummy - they do not carry any computation. -template -Dtype ImagesLayer::Backward_cpu(const vector*>& top, - const bool propagate_down, vector*>* bottom) { - return Dtype(0.); -} - -template -Dtype ImagesLayer::Backward_gpu(const vector*>& top, - const bool propagate_down, vector*>* bottom) { return Dtype(0.); } diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 6987a787ed3..6ea228fefdd 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -1,8 +1,5 @@ // Copyright 2013 Yangqing Jia - -#include - #include #include "caffe/blob.hpp" @@ -61,7 +58,7 @@ void InnerProductLayer::SetUp(const vector*>& bottom, } template -void InnerProductLayer::Forward_cpu(const vector*>& bottom, +Dtype InnerProductLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = (*top)[0]->mutable_cpu_data(); @@ -73,10 +70,11 @@ void InnerProductLayer::Forward_cpu(const vector*>& bottom, reinterpret_cast(bias_multiplier_->cpu_data()), this->blobs_[1]->cpu_data(), (Dtype)1., top_data); } + return Dtype(0); } template -Dtype InnerProductLayer::Backward_cpu(const vector*>& top, +void InnerProductLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->cpu_diff(); @@ -96,7 +94,6 @@ Dtype InnerProductLayer::Backward_cpu(const vector*>& top, top_diff, this->blobs_[0]->cpu_data(), (Dtype)0., (*bottom)[0]->mutable_cpu_diff()); } - return Dtype(0); } INSTANTIATE_CLASS(InnerProductLayer); diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu index c7c3e2a99fd..37463b5a971 100644 --- a/src/caffe/layers/inner_product_layer.cu +++ b/src/caffe/layers/inner_product_layer.cu @@ -1,7 +1,5 @@ // Copyright 2013 Yangqing Jia - -#include #include #include @@ -16,7 +14,7 @@ namespace caffe { template -void InnerProductLayer::Forward_gpu(const vector*>& 
bottom, +Dtype InnerProductLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = (*top)[0]->mutable_gpu_data(); @@ -28,10 +26,11 @@ void InnerProductLayer::Forward_gpu(const vector*>& bottom, reinterpret_cast(bias_multiplier_->gpu_data()), this->blobs_[1]->gpu_data(), (Dtype)1., top_data); } + return Dtype(0); } template -Dtype InnerProductLayer::Backward_gpu(const vector*>& top, +void InnerProductLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->gpu_diff(); @@ -51,7 +50,6 @@ Dtype InnerProductLayer::Backward_gpu(const vector*>& top, top_diff, this->blobs_[0]->gpu_data(), (Dtype)0., (*bottom)[0]->mutable_gpu_diff()); } - return Dtype(0); } INSTANTIATE_CLASS(InnerProductLayer); diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp index 1c4303d9bd4..ef0074d5454 100644 --- a/src/caffe/layers/loss_layer.cpp +++ b/src/caffe/layers/loss_layer.cpp @@ -28,9 +28,24 @@ void MultinomialLogisticLossLayer::SetUp( CHECK_EQ(bottom[1]->width(), 1); } +template +Dtype MultinomialLogisticLossLayer::Forward_cpu( + const vector*>& bottom, vector*>* top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* bottom_label = bottom[1]->cpu_data(); + int num = bottom[0]->num(); + int dim = bottom[0]->count() / bottom[0]->num(); + Dtype loss = 0; + for (int i = 0; i < num; ++i) { + int label = static_cast(bottom_label[i]); + Dtype prob = max(bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); + loss -= log(prob); + } + return loss / num; +} template -Dtype MultinomialLogisticLossLayer::Backward_cpu( +void MultinomialLogisticLossLayer::Backward_cpu( const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* bottom_data = (*bottom)[0]->cpu_data(); @@ -39,18 +54,13 @@ Dtype MultinomialLogisticLossLayer::Backward_cpu( int num = (*bottom)[0]->num(); int dim = (*bottom)[0]->count() / (*bottom)[0]->num(); memset(bottom_diff, 0, sizeof(Dtype) * (*bottom)[0]->count()); - Dtype loss = 0; for (int i = 0; i < num; ++i) { int label = static_cast(bottom_label[i]); Dtype prob = max(bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); - loss -= log(prob); - bottom_diff[i * dim + label] = - 1. / prob / num; + bottom_diff[i * dim + label] = -1. 
/ prob / num; } - return loss / num; } -// TODO: implement the GPU version for multinomial loss - template void InfogainLossLayer::SetUp( @@ -72,7 +82,27 @@ void InfogainLossLayer::SetUp( template -Dtype InfogainLossLayer::Backward_cpu(const vector*>& top, +Dtype InfogainLossLayer::Forward_cpu(const vector*>& bottom, + vector*>* top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* bottom_label = bottom[1]->cpu_data(); + const Dtype* infogain_mat = infogain_.cpu_data(); + int num = bottom[0]->num(); + int dim = bottom[0]->count() / bottom[0]->num(); + CHECK_EQ(infogain_.height(), dim); + Dtype loss = 0; + for (int i = 0; i < num; ++i) { + int label = static_cast(bottom_label[i]); + for (int j = 0; j < dim; ++j) { + Dtype prob = max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD)); + loss -= infogain_mat[label * dim + j] * log(prob); + } + } + return loss / num; +} + +template +void InfogainLossLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* bottom_data = (*bottom)[0]->cpu_data(); @@ -82,16 +112,13 @@ Dtype InfogainLossLayer::Backward_cpu(const vector*>& top, int num = (*bottom)[0]->num(); int dim = (*bottom)[0]->count() / (*bottom)[0]->num(); CHECK_EQ(infogain_.height(), dim); - Dtype loss = 0; for (int i = 0; i < num; ++i) { int label = static_cast(bottom_label[i]); for (int j = 0; j < dim; ++j) { Dtype prob = max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD)); - loss -= infogain_mat[label * dim + j] * log(prob); bottom_diff[i * dim + j] = - infogain_mat[label * dim + j] / prob / num; } } - return loss / num; } @@ -110,18 +137,25 @@ void EuclideanLossLayer::SetUp( } template -Dtype EuclideanLossLayer::Backward_cpu(const vector*>& top, - const bool propagate_down, vector*>* bottom) { - int count = (*bottom)[0]->count(); - int num = (*bottom)[0]->num(); - caffe_sub(count, (*bottom)[0]->cpu_data(), (*bottom)[1]->cpu_data(), +Dtype EuclideanLossLayer::Forward_cpu(const vector*>& bottom, + vector*>* top) { + int count = bottom[0]->count(); + int num = bottom[0]->num(); + caffe_sub(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), difference_.mutable_cpu_data()); Dtype loss = caffe_cpu_dot( count, difference_.cpu_data(), difference_.cpu_data()) / num / Dtype(2); + return loss; +} + +template +void EuclideanLossLayer::Backward_cpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { + int count = (*bottom)[0]->count(); + int num = (*bottom)[0]->num(); // Compute the gradient - caffe_axpby(count, Dtype(1) / num, difference_.cpu_data(), Dtype(0), + caffe_cpu_axpby(count, Dtype(1) / num, difference_.cpu_data(), Dtype(0), (*bottom)[0]->mutable_cpu_diff()); - return loss; } template @@ -138,7 +172,7 @@ void AccuracyLayer::SetUp( } template -void AccuracyLayer::Forward_cpu(const vector*>& bottom, +Dtype AccuracyLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { Dtype accuracy = 0; Dtype logprob = 0; @@ -166,6 +200,8 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, // LOG(INFO) << "Accuracy: " << accuracy; (*top)[0]->mutable_cpu_data()[0] = accuracy / num; (*top)[0]->mutable_cpu_data()[1] = logprob / num; + // Accuracy layer should not be used as a loss function. 
+ return Dtype(0); } INSTANTIATE_CLASS(MultinomialLogisticLossLayer); diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index 36dbe41ea8c..698debab6a6 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -28,7 +28,7 @@ void LRNLayer::SetUp(const vector*>& bottom, } template -void LRNLayer::Forward_cpu(const vector*>& bottom, +Dtype LRNLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = (*top)[0]->mutable_cpu_data(); @@ -72,10 +72,12 @@ void LRNLayer::Forward_cpu(const vector*>& bottom, // In the end, compute output caffe_powx(scale_.count(), scale_data, -beta_, top_data); caffe_mul(scale_.count(), top_data, bottom_data, top_data); + + return Dtype(0.); } template -Dtype LRNLayer::Backward_cpu(const vector*>& top, +void LRNLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* top_data = top[0]->cpu_data(); @@ -126,7 +128,6 @@ Dtype LRNLayer::Backward_cpu(const vector*>& top, padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); } } - return Dtype(0.); } INSTANTIATE_CLASS(LRNLayer); diff --git a/src/caffe/layers/lrn_layer.cu b/src/caffe/layers/lrn_layer.cu index 028aa8fa47e..1dcd0c087c0 100644 --- a/src/caffe/layers/lrn_layer.cu +++ b/src/caffe/layers/lrn_layer.cu @@ -65,7 +65,7 @@ __global__ void LRNComputeOutput(const int nthreads, const Dtype* in, } template -void LRNLayer::Forward_gpu(const vector*>& bottom, +Dtype LRNLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { // First, compute scale const Dtype* bottom_data = bottom[0]->gpu_data(); @@ -84,6 +84,7 @@ void LRNLayer::Forward_gpu(const vector*>& bottom, LRNComputeOutput<<>>( n_threads, bottom_data, scale_data, -beta_, top_data); CUDA_POST_KERNEL_CHECK; + return Dtype(0.); } @@ -149,7 +150,7 @@ __global__ void LRNComputeDiff(const int nthreads, const Dtype* bottom_data, } template -Dtype LRNLayer::Backward_gpu(const vector*>& top, +void LRNLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { int n_threads = num_ * height_ * width_; // NOLINT_NEXT_LINE(whitespace/operators) @@ -158,7 +159,6 @@ Dtype LRNLayer::Backward_gpu(const vector*>& top, scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, size_, -beta_, Dtype(2. 
* alpha_ * beta_ / size_), (*bottom)[0]->mutable_gpu_diff()); - return Dtype(0.); } diff --git a/src/caffe/layers/padding_layer.cpp b/src/caffe/layers/padding_layer.cpp index 4cb67df0dcf..658cc6ab16c 100644 --- a/src/caffe/layers/padding_layer.cpp +++ b/src/caffe/layers/padding_layer.cpp @@ -29,7 +29,7 @@ void PaddingLayer::SetUp(const vector*>& bottom, } template -void PaddingLayer::Forward_cpu(const vector*>& bottom, +Dtype PaddingLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { Dtype* top_data = (*top)[0]->mutable_cpu_data(); const Dtype* bottom_data = bottom[0]->cpu_data(); @@ -47,10 +47,11 @@ void PaddingLayer::Forward_cpu(const vector*>& bottom, } } } + return Dtype(0.); } template -Dtype PaddingLayer::Backward_cpu(const vector*>& top, +void PaddingLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->cpu_diff(); Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); @@ -66,7 +67,6 @@ Dtype PaddingLayer::Backward_cpu(const vector*>& top, } } } - return Dtype(0.); } INSTANTIATE_CLASS(PaddingLayer); diff --git a/src/caffe/layers/padding_layer.cu b/src/caffe/layers/padding_layer.cu index 7ec28a9e30f..d476df501fd 100644 --- a/src/caffe/layers/padding_layer.cu +++ b/src/caffe/layers/padding_layer.cu @@ -27,7 +27,7 @@ __global__ void PaddingForward(const int count, const Dtype* in, Dtype* out, } template -void PaddingLayer::Forward_gpu(const vector*>& bottom, +Dtype PaddingLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = (*top)[0]->mutable_gpu_data(); @@ -39,6 +39,7 @@ void PaddingLayer::Forward_gpu(const vector*>& bottom, count, bottom_data, top_data, NUM_, CHANNEL_, HEIGHT_IN_, WIDTH_IN_, PAD_); CUDA_POST_KERNEL_CHECK; + return Dtype(0); } template @@ -61,7 +62,7 @@ __global__ void PaddingBackward(const int count, const Dtype* in, Dtype* out, } template -Dtype PaddingLayer::Backward_gpu(const vector*>& top, +void PaddingLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { if (propagate_down) { @@ -74,7 +75,6 @@ Dtype PaddingLayer::Backward_gpu(const vector*>& top, PAD_); CUDA_POST_KERNEL_CHECK; } - return Dtype(0); } INSTANTIATE_CLASS(PaddingLayer); diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index ce30e842c58..3fd421cd640 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -39,7 +39,7 @@ void PoolingLayer::SetUp(const vector*>& bottom, // TODO(Yangqing): Is there a faster way to do pooling in the channel-first // case? 
template -void PoolingLayer::Forward_cpu(const vector*>& bottom, +Dtype PoolingLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = (*top)[0]->mutable_cpu_data(); @@ -111,13 +111,14 @@ void PoolingLayer::Forward_cpu(const vector*>& bottom, default: LOG(FATAL) << "Unknown pooling method."; } + return Dtype(0.); } template -Dtype PoolingLayer::Backward_cpu(const vector*>& top, +void PoolingLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { if (!propagate_down) { - return Dtype(0.); + return; } const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* top_data = top[0]->cpu_data(); @@ -188,7 +189,6 @@ Dtype PoolingLayer::Backward_cpu(const vector*>& top, default: LOG(FATAL) << "Unknown pooling method."; } - return Dtype(0.); } diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/pooling_layer.cu index 357a392976d..63b4d0dbad7 100644 --- a/src/caffe/layers/pooling_layer.cu +++ b/src/caffe/layers/pooling_layer.cu @@ -135,7 +135,7 @@ __global__ void StoPoolForwardTest(const int nthreads, template -void PoolingLayer::Forward_gpu(const vector*>& bottom, +Dtype PoolingLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = (*top)[0]->mutable_gpu_data(); @@ -179,6 +179,7 @@ void PoolingLayer::Forward_gpu(const vector*>& bottom, LOG(FATAL) << "Unknown pooling method."; } CUDA_POST_KERNEL_CHECK; + return Dtype(0.); } template @@ -277,10 +278,10 @@ __global__ void StoPoolBackward(const int nthreads, template -Dtype PoolingLayer::Backward_gpu(const vector*>& top, +void PoolingLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { if (!propagate_down) { - return Dtype(0.); + return; } const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); @@ -311,7 +312,6 @@ Dtype PoolingLayer::Backward_gpu(const vector*>& top, LOG(FATAL) << "Unknown pooling method."; } CUDA_POST_KERNEL_CHECK; - return Dtype(0.); } diff --git a/src/caffe/layers/regularizer_as_loss_layer.cpp b/src/caffe/layers/regularizer_as_loss_layer.cpp new file mode 100644 index 00000000000..684725e2312 --- /dev/null +++ b/src/caffe/layers/regularizer_as_loss_layer.cpp @@ -0,0 +1,64 @@ +// Copyright 2014 kloudkl@github + +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/vision_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { +using std::vector; + +template +RegularizerAsLossLayer::RegularizerAsLossLayer( + const LayerParameter& param) + : Layer(param), + num_regularizers_(param.regularizer_size()) { + if (num_regularizers_ > 0) { + regularizers_.resize(num_regularizers_); + for (int i = 0; i < num_regularizers_; ++i) { + regularizers_[i].reset(GetRegularizer(param.regularizer(i))); + } + } +} + +template +void RegularizerAsLossLayer::SetUp(const vector*>& bottom, + vector*>* top) { + CHECK_EQ(bottom.size(), 1)<< + "RegularizerAsLossLayer takes one blob as input."; + CHECK_EQ(top->size(), 0) << + "RegularizerAsLossLayer takes no blob as output."; +} + +template +Dtype RegularizerAsLossLayer::Forward_cpu( + const vector*>& bottom, vector*>* top) { + Blob* bottom_data = bottom[0]; + if (bottom_data->count() > 0) { + memset(bottom_data->mutable_cpu_diff(), 0, + bottom_data->count() * sizeof(Dtype)); + Dtype loss = 0; + for (int i = 0; i < num_regularizers_; ++i) { + loss += regularizers_[i]->Regularize_cpu(bottom_data); + } + 
int num = bottom_data->num(); + // Scale down gradient + caffe_scal(bottom_data->count(), Dtype(1) / num, + bottom_data->mutable_cpu_diff()); + return loss / num; + } + return Dtype(0); +} + +template +void RegularizerAsLossLayer::Backward_cpu( + const vector*>& top, const bool propagate_down, + vector*>* bottom) { + return; +} + +INSTANTIATE_CLASS(RegularizerAsLossLayer); + +} // namespace caffe diff --git a/src/caffe/layers/regularizer_as_loss_layer.cu b/src/caffe/layers/regularizer_as_loss_layer.cu new file mode 100644 index 00000000000..45fd68eb7fd --- /dev/null +++ b/src/caffe/layers/regularizer_as_loss_layer.cu @@ -0,0 +1,43 @@ +// Copyright 2014 kloudkl@github + +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/vision_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { +using std::vector; + +template +Dtype RegularizerAsLossLayer::Forward_gpu( + const vector*>& bottom, vector*>* top) { + Blob* bottom_data = bottom[0]; + if (bottom_data->count() > 0) { + CUDA_CHECK( + cudaMemset(bottom_data->mutable_gpu_diff(), 0, + bottom_data->count() * sizeof(Dtype))); + Dtype loss = 0; + for (int i = 0; i < num_regularizers_; ++i) { + loss += regularizers_[i]->Regularize_gpu(bottom_data); + } + int num = bottom_data->num(); + // Scale down gradient + caffe_gpu_scal(bottom_data->count(), Dtype(1) / num, + bottom_data->mutable_gpu_diff()); + return loss / num; + } + return Dtype(0); +} + +template +void RegularizerAsLossLayer::Backward_gpu( + const vector*>& top, const bool propagate_down, + vector*>* bottom) { + return; +} + +INSTANTIATE_CLASS(RegularizerAsLossLayer); + +} // namespace caffe diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index 27ae94b7cb0..18c675c98c7 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -11,7 +11,7 @@ using std::max; namespace caffe { template -void ReLULayer::Forward_cpu(const vector*>& bottom, +Dtype ReLULayer::Forward_cpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = (*top)[0]->mutable_cpu_data(); @@ -19,10 +19,11 @@ void ReLULayer::Forward_cpu(const vector*>& bottom, for (int i = 0; i < count; ++i) { top_data[i] = max(bottom_data[i], Dtype(0)); } + return Dtype(0); } template -Dtype ReLULayer::Backward_cpu(const vector*>& top, +void ReLULayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { if (propagate_down) { @@ -34,7 +35,6 @@ Dtype ReLULayer::Backward_cpu(const vector*>& top, bottom_diff[i] = top_diff[i] * (bottom_data[i] > 0); } } - return Dtype(0); } diff --git a/src/caffe/layers/relu_layer.cu b/src/caffe/layers/relu_layer.cu index 20a5a45e2f4..27f5da5cc89 100644 --- a/src/caffe/layers/relu_layer.cu +++ b/src/caffe/layers/relu_layer.cu @@ -18,7 +18,7 @@ __global__ void ReLUForward(const int n, const Dtype* in, Dtype* out) { } template -void ReLULayer::Forward_gpu(const vector*>& bottom, +Dtype ReLULayer::Forward_gpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = (*top)[0]->mutable_gpu_data(); @@ -32,6 +32,7 @@ void ReLULayer::Forward_gpu(const vector*>& bottom, // << " top_data: " << (unsigned long)top_data // << " blocks: " << CAFFE_GET_BLOCKS(count) // << " threads: " << CAFFE_CUDA_NUM_THREADS; + return Dtype(0); } template @@ -43,7 +44,7 @@ __global__ void ReLUBackward(const int n, const Dtype* in_diff, } template -Dtype ReLULayer::Backward_gpu(const vector*>& top, +void 
ReLULayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { if (propagate_down) { @@ -56,7 +57,6 @@ Dtype ReLULayer::Backward_gpu(const vector*>& top, count, top_diff, bottom_data, bottom_diff); CUDA_POST_KERNEL_CHECK; } - return Dtype(0); } INSTANTIATE_CLASS(ReLULayer); diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index ba6ec84e717..44897954677 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -15,7 +15,7 @@ inline Dtype sigmoid(Dtype x) { } template -void SigmoidLayer::Forward_cpu(const vector*>& bottom, +Dtype SigmoidLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = (*top)[0]->mutable_cpu_data(); @@ -23,10 +23,11 @@ void SigmoidLayer::Forward_cpu(const vector*>& bottom, for (int i = 0; i < count; ++i) { top_data[i] = sigmoid(bottom_data[i]); } + return Dtype(0); } template -Dtype SigmoidLayer::Backward_cpu(const vector*>& top, +void SigmoidLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { if (propagate_down) { @@ -39,7 +40,6 @@ Dtype SigmoidLayer::Backward_cpu(const vector*>& top, bottom_diff[i] = top_diff[i] * sigmoid_x * (1. - sigmoid_x); } } - return Dtype(0); } INSTANTIATE_CLASS(SigmoidLayer); diff --git a/src/caffe/layers/sigmoid_layer.cu b/src/caffe/layers/sigmoid_layer.cu index ba311f814a3..3dbdc397bee 100644 --- a/src/caffe/layers/sigmoid_layer.cu +++ b/src/caffe/layers/sigmoid_layer.cu @@ -24,7 +24,7 @@ __global__ void SigmoidForward(const int n, const Dtype* in, Dtype* out) { } template -void SigmoidLayer::Forward_gpu(const vector*>& bottom, +Dtype SigmoidLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = (*top)[0]->mutable_gpu_data(); @@ -38,6 +38,7 @@ void SigmoidLayer::Forward_gpu(const vector*>& bottom, // << " top_data: " << (unsigned long)top_data // << " blocks: " << CAFFE_GET_BLOCKS(count) // << " threads: " << CAFFE_CUDA_NUM_THREADS; + return Dtype(0); } template @@ -50,7 +51,7 @@ __global__ void SigmoidBackward(const int n, const Dtype* in_diff, } template -Dtype SigmoidLayer::Backward_gpu(const vector*>& top, +void SigmoidLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { if (propagate_down) { @@ -63,7 +64,6 @@ Dtype SigmoidLayer::Backward_gpu(const vector*>& top, count, top_diff, bottom_data, bottom_diff); CUDA_POST_KERNEL_CHECK; } - return Dtype(0); } INSTANTIATE_CLASS(SigmoidLayer); diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 69e95ff6385..0d2e4572c76 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -28,7 +28,7 @@ void SoftmaxLayer::SetUp(const vector*>& bottom, } template -void SoftmaxLayer::Forward_cpu(const vector*>& bottom, +Dtype SoftmaxLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = (*top)[0]->mutable_cpu_data(); @@ -56,10 +56,11 @@ void SoftmaxLayer::Forward_cpu(const vector*>& bottom, for (int i = 0; i < num; ++i) { caffe_scal(dim, Dtype(1.) 
/ scale_data[i], top_data + i * dim); } + return Dtype(0); } template -Dtype SoftmaxLayer::Backward_cpu(const vector*>& top, +void SoftmaxLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->cpu_diff(); @@ -79,7 +80,6 @@ Dtype SoftmaxLayer::Backward_cpu(const vector*>& top, scale_data, sum_multiplier_.cpu_data(), 1., bottom_diff); // elementwise multiplication caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); - return Dtype(0); } diff --git a/src/caffe/layers/softmax_layer.cu b/src/caffe/layers/softmax_layer.cu index 2e41a1794df..5efa4909263 100644 --- a/src/caffe/layers/softmax_layer.cu +++ b/src/caffe/layers/softmax_layer.cu @@ -43,7 +43,7 @@ __global__ void kernel_exp(const int num, const Dtype* data, Dtype* out) { } template -void SoftmaxLayer::Forward_gpu(const vector*>& bottom, +Dtype SoftmaxLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = (*top)[0]->mutable_gpu_data(); @@ -73,11 +73,12 @@ void SoftmaxLayer::Forward_gpu(const vector*>& bottom, kernel_softmax_div<<>>( num, dim, scale_data, top_data); + return Dtype(0); } // TODO(Yangqing): implement the GPU version of softmax. template -Dtype SoftmaxLayer::Backward_gpu(const vector*>& top, +void SoftmaxLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->gpu_diff(); const Dtype* top_data = top[0]->gpu_data(); @@ -103,7 +104,6 @@ Dtype SoftmaxLayer::Backward_gpu(const vector*>& top, scale_.gpu_data(), sum_multiplier_.gpu_data(), 1., bottom_diff); // elementwise multiplication caffe_gpu_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); - return Dtype(0); } INSTANTIATE_CLASS(SoftmaxLayer); diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index 6fdaea5a1dd..f9bd82e217a 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -24,33 +24,39 @@ void SoftmaxWithLossLayer::SetUp(const vector*>& bottom, } template -void SoftmaxWithLossLayer::Forward_cpu( +Dtype SoftmaxWithLossLayer::Forward_cpu( const vector*>& bottom, vector*>* top) { // The forward pass computes the softmax prob values. 
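// The value returned below is the average multinomial negative log-likelihood,
//   loss = -(1/num) * sum_i log(max(prob_i[label_i], FLT_MIN)),
// where prob_i is the softmax output for example i; clamping at FLT_MIN keeps
// the log finite when the probability of the true class underflows.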
softmax_bottom_vec_[0] = bottom[0]; softmax_layer_->Forward(softmax_bottom_vec_, &softmax_top_vec_); + const Dtype* prob_data = prob_.cpu_data(); + const Dtype* label = bottom[1]->cpu_data(); + int num = prob_.num(); + int dim = prob_.count() / num; + Dtype loss = 0; + for (int i = 0; i < num; ++i) { + loss += -log(max(prob_data[i * dim + static_cast(label[i])], + Dtype(FLT_MIN))); + } + return loss / num; } template -Dtype SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, +void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { - // First, compute the diff + // Compute the diff Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); const Dtype* prob_data = prob_.cpu_data(); memcpy(bottom_diff, prob_data, sizeof(Dtype) * prob_.count()); const Dtype* label = (*bottom)[1]->cpu_data(); int num = prob_.num(); int dim = prob_.count() / num; - Dtype loss = 0; for (int i = 0; i < num; ++i) { bottom_diff[i * dim + static_cast(label[i])] -= 1; - loss += -log(max(prob_data[i * dim + static_cast(label[i])], - Dtype(FLT_MIN))); } // Scale down gradient caffe_scal(prob_.count(), Dtype(1) / num, bottom_diff); - return loss / num; } diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu index 100393caa3d..ab7ee6ee3bb 100644 --- a/src/caffe/layers/softmax_loss_layer.cu +++ b/src/caffe/layers/softmax_loss_layer.cu @@ -13,18 +13,17 @@ using std::max; namespace caffe { template -void SoftmaxWithLossLayer::Forward_gpu( +Dtype SoftmaxWithLossLayer::Forward_gpu( const vector*>& bottom, vector*>* top) { // The forward pass computes the softmax prob values. - softmax_bottom_vec_[0] = bottom[0]; - softmax_layer_->Forward(softmax_bottom_vec_, &softmax_top_vec_); + return Forward_cpu(bottom, top); } template -Dtype SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, +void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { // TODO(Yangqing): implement the GPU version of softmax. 
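// Both GPU paths of this layer currently fall through to the CPU
// implementations (Forward_gpu above now just returns Forward_cpu's loss);
// once a dedicated softmax-loss kernel exists, only these two wrappers change.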
- return Backward_cpu(top, propagate_down, bottom); + Backward_cpu(top, propagate_down, bottom); } INSTANTIATE_CLASS(SoftmaxWithLossLayer); diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index f9fc461a11f..a8a240f74a6 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -28,7 +28,7 @@ void SplitLayer::SetUp(const vector*>& bottom, } template -void SplitLayer::Forward_cpu(const vector*>& bottom, +Dtype SplitLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->cpu_data(); for (int i = 0; i < top->size(); ++i) { @@ -38,10 +38,11 @@ void SplitLayer::Forward_cpu(const vector*>& bottom, Dtype* top_data = (*top)[i]->mutable_cpu_data(); caffe_copy(count_, bottom_data, top_data); } + return Dtype(0.); } template -Dtype SplitLayer::Backward_cpu(const vector*>& top, +void SplitLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { if (propagate_down) { const Dtype* top_diff = top[0]->cpu_diff(); @@ -58,7 +59,6 @@ Dtype SplitLayer::Backward_cpu(const vector*>& top, caffe_axpy(count_, Dtype(1.), top_diff, bottom_diff); } } - return Dtype(0.); } diff --git a/src/caffe/layers/split_layer.cu b/src/caffe/layers/split_layer.cu index 5f25a460a6a..deccf990a27 100644 --- a/src/caffe/layers/split_layer.cu +++ b/src/caffe/layers/split_layer.cu @@ -9,7 +9,7 @@ namespace caffe { template -void SplitLayer::Forward_gpu(const vector*>& bottom, +Dtype SplitLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->gpu_data(); for (int i = 0; i < top->size(); ++i) { @@ -19,10 +19,11 @@ void SplitLayer::Forward_gpu(const vector*>& bottom, Dtype* top_data = (*top)[i]->mutable_gpu_data(); caffe_gpu_copy(count_, bottom_data, top_data); } + return Dtype(0.); } template -Dtype SplitLayer::Backward_gpu(const vector*>& top, +void SplitLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { if (propagate_down) { const Dtype* top_diff = top[0]->gpu_diff(); @@ -39,7 +40,6 @@ Dtype SplitLayer::Backward_gpu(const vector*>& top, caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); } } - return Dtype(0.); } diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp index d6f99560082..c26579234bc 100644 --- a/src/caffe/layers/tanh_layer.cpp +++ b/src/caffe/layers/tanh_layer.cpp @@ -11,7 +11,7 @@ namespace caffe { template -void TanHLayer::Forward_cpu(const vector*>& bottom, +Dtype TanHLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = (*top)[0]->mutable_cpu_data(); @@ -21,10 +21,11 @@ void TanHLayer::Forward_cpu(const vector*>& bottom, exp2x = exp(2*bottom_data[i]); top_data[i] = (exp2x - Dtype(1))/(exp2x + Dtype(1)); } + return Dtype(0); } template -Dtype TanHLayer::Backward_cpu(const vector*>& top, +void TanHLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { if (propagate_down) { @@ -40,7 +41,6 @@ Dtype TanHLayer::Backward_cpu(const vector*>& top, bottom_diff[i] = top_diff[i] * (1 - tanhx*tanhx); } } - return Dtype(0); } INSTANTIATE_CLASS(TanHLayer); diff --git a/src/caffe/layers/tanh_layer.cu b/src/caffe/layers/tanh_layer.cu index c1f8a29cc5c..899b841b069 100644 --- a/src/caffe/layers/tanh_layer.cu +++ b/src/caffe/layers/tanh_layer.cu @@ -19,7 +19,7 @@ __global__ void TanHForward(const int n, const Dtype* in, Dtype* out) { } template -void TanHLayer::Forward_gpu(const vector*>& 
bottom, +Dtype TanHLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = (*top)[0]->mutable_gpu_data(); @@ -33,6 +33,7 @@ void TanHLayer::Forward_gpu(const vector*>& bottom, // << " top_data: " << (unsigned long)top_data // << " blocks: " << CAFFE_GET_BLOCKS(count) // << " threads: " << CAFFE_CUDA_NUM_THREADS; + return Dtype(0); } template @@ -46,7 +47,7 @@ __global__ void TanHBackward(const int n, const Dtype* in_diff, } template -Dtype TanHLayer::Backward_gpu(const vector*>& top, +void TanHLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { if (propagate_down) { @@ -59,7 +60,6 @@ Dtype TanHLayer::Backward_gpu(const vector*>& top, count, top_diff, bottom_data, bottom_diff); CUDA_POST_KERNEL_CHECK; } - return Dtype(0); } INSTANTIATE_CLASS(TanHLayer); diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index 87fb54112f1..a288403f284 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -403,7 +403,7 @@ void WindowDataLayer::SetUp(const vector*>& bottom, } template -void WindowDataLayer::Forward_cpu(const vector*>& bottom, +Dtype WindowDataLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { // First, join the thread CHECK(!pthread_join(thread_, NULL)) << "Pthread joining failed."; @@ -415,10 +415,11 @@ void WindowDataLayer::Forward_cpu(const vector*>& bottom, // Start a new prefetch thread CHECK(!pthread_create(&thread_, NULL, WindowDataLayerPrefetch, reinterpret_cast(this))) << "Pthread execution failed."; + return Dtype(0.); } template -void WindowDataLayer::Forward_gpu(const vector*>& bottom, +Dtype WindowDataLayer::Forward_gpu(const vector*>& bottom, vector*>* top) { // First, join the thread CHECK(!pthread_join(thread_, NULL)) << "Pthread joining failed."; @@ -432,18 +433,6 @@ void WindowDataLayer::Forward_gpu(const vector*>& bottom, // Start a new prefetch thread CHECK(!pthread_create(&thread_, NULL, WindowDataLayerPrefetch, reinterpret_cast(this))) << "Pthread execution failed."; -} - -// The backward operations are dummy - they do not carry any computation. 
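// As with DataLayer and ImagesLayer earlier in this patch, WindowDataLayer only
// produces prefetched data: its Forward_* methods report Dtype(0.) loss and the
// empty Backward_* overrides below are dropped under the void-returning interface.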
-template -Dtype WindowDataLayer::Backward_cpu(const vector*>& top, - const bool propagate_down, vector*>* bottom) { - return Dtype(0.); -} - -template -Dtype WindowDataLayer::Backward_gpu(const vector*>& top, - const bool propagate_down, vector*>* bottom) { return Dtype(0.); } diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 1837b0768ae..3018285cd76 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -162,6 +162,12 @@ void Net::Init(const NetParameter& in_param) { LOG(INFO) << "This network produces output " << *it; net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get()); } + for (size_t i = 0; i < blob_names_.size(); ++i) { + blob_names_index_[blob_names_[i]] = i; + } + for (size_t i = 0; i < layer_names_.size(); ++i) { + layer_names_index_[layer_names_[i]] = i; + } GetLearningRateAndWeightDecay(); LOG(INFO) << "Network initialization done."; LOG(INFO) << "Memory required for Data " << memory_used*sizeof(Dtype); @@ -207,27 +213,32 @@ void Net::GetLearningRateAndWeightDecay() { } template -const vector*>& Net::ForwardPrefilled() { +const vector*>& Net::ForwardPrefilled(Dtype* loss) { + if (loss != NULL) { + *loss = Dtype(0.); + } for (int i = 0; i < layers_.size(); ++i) { // LOG(ERROR) << "Forwarding " << layer_names_[i]; - layers_[i]->Forward(bottom_vecs_[i], &top_vecs_[i]); + Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], &top_vecs_[i]); + if (loss != NULL) { + *loss += layer_loss; + } } return net_output_blobs_; } template const vector*>& Net::Forward( - const vector*> & bottom) { + const vector*> & bottom, Dtype* loss) { // Copy bottom to internal bottom for (int i = 0; i < bottom.size(); ++i) { net_input_blobs_[i]->CopyFrom(*bottom[i]); } - return ForwardPrefilled(); + return ForwardPrefilled(loss); } - template -string Net::Forward(const string& input_blob_protos) { +string Net::Forward(const string& input_blob_protos, Dtype* loss) { BlobProtoVector blob_proto_vec; if (net_input_blobs_.size()) { blob_proto_vec.ParseFromString(input_blob_protos); @@ -237,7 +248,7 @@ string Net::Forward(const string& input_blob_protos) { net_input_blobs_[i]->FromProto(blob_proto_vec.blobs(i)); } } - ForwardPrefilled(); + ForwardPrefilled(loss); blob_proto_vec.Clear(); for (int i = 0; i < net_output_blobs_.size(); ++i) { net_output_blobs_[i]->ToProto(blob_proto_vec.add_blobs()); @@ -249,16 +260,12 @@ string Net::Forward(const string& input_blob_protos) { template -Dtype Net::Backward() { - Dtype loss = 0; +void Net::Backward() { for (int i = layers_.size() - 1; i >= 0; --i) { if (layer_need_backward_[i]) { - Dtype layer_loss = layers_[i]->Backward( - top_vecs_[i], true, &bottom_vecs_[i]); - loss += layer_loss; + layers_[i]->Backward(top_vecs_[i], true, &bottom_vecs_[i]); } } - return loss; } template @@ -327,6 +334,42 @@ void Net::Update() { } } +template +bool Net::has_blob(const string& blob_name) { + return blob_names_index_.find(blob_name) != blob_names_index_.end(); +} + +template +const shared_ptr > Net::blob_by_name( + const string& blob_name) { + shared_ptr > blob_ptr; + if (has_blob(blob_name)) { + blob_ptr = blobs_[blob_names_index_[blob_name]]; + } else { + blob_ptr.reset((Blob*)(NULL)); + LOG(WARNING) << "Unknown blob name " << blob_name; + } + return blob_ptr; +} + +template +bool Net::has_layer(const string& layer_name) { + return layer_names_index_.find(layer_name) != layer_names_index_.end(); +} + +template +const shared_ptr > Net::layer_by_name( + const string& layer_name) { + shared_ptr > layer_ptr; + if (has_layer(layer_name)) { + layer_ptr = 
layers_[layer_names_index_[layer_name]]; + } else { + layer_ptr.reset((Layer*)(NULL)); + LOG(WARNING) << "Unknown layer name " << layer_name; + } + return layer_ptr; +} + INSTANTIATE_CLASS(Net); } // namespace caffe diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 5a73a4496e0..b7a2142edea 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -38,6 +38,18 @@ message FillerParameter { optional float std = 6 [default = 1]; // the std value in gaussian filler } +message RegularizerParameter { +// Weight regularizer type + enum RegularizerType { + L1 = 0; + L2 = 1; + MAX_NORM = 2; // not fully implemented yet + } + optional RegularizerType type = 1; + // Coefficent controls how strong to regularize + optional float coeff = 2 [default = 0]; +} + message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the string to specify the layer type @@ -125,6 +137,14 @@ message LayerParameter { // the other dimensions must be the same for all the bottom blobs. // By default it will concatenate blobs along the channels dimension. optional uint32 concat_dim = 65 [default = 1]; + + optional HDF5OutputParameter hdf5_output_param = 1001; + + repeated RegularizerParameter regularizer = 2001; +} + +message HDF5OutputParameter { + optional string file_name = 1; } message LayerConnection { diff --git a/src/caffe/regularizer.cpp b/src/caffe/regularizer.cpp new file mode 100644 index 00000000000..704a36b9c54 --- /dev/null +++ b/src/caffe/regularizer.cpp @@ -0,0 +1,91 @@ +// Copyright 2014 kloudkl@github + +#include // for std::abs + +#include "caffe/proto/caffe.pb.h" +#include "caffe/regularizer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +Dtype Regularizer::Regularize(Blob* bottom) { + Dtype penalty = 0; + if (Caffe::mode() == Caffe::CPU) { + penalty = Regularize_cpu(bottom); + } else if (Caffe::mode() == Caffe::GPU) { + penalty = Regularize_gpu(bottom); + } else { + LOG(FATAL)<< "Unknown mode: " << Caffe::mode(); + } + return penalty; +} + +template +Dtype L1Regularizer::Regularize_cpu(Blob* bottom) { + if (this->coeff_ == 0) { + return Dtype(0.); + } + const Dtype* data = bottom->cpu_data(); + Dtype* diff = bottom->mutable_cpu_diff(); + int count = bottom->count(); + for (int c = 0; c < count; ++c) { + diff[c] += this->coeff_ * caffe_sign(data[c]); + } + Dtype penalty = caffe_cpu_asum(count, data); + return this->coeff_ * penalty; +} + +template +Dtype L2Regularizer::Regularize_cpu(Blob* bottom) { + if (this->coeff_ == 0) { + return Dtype(0); + } + const Dtype* data = bottom->cpu_data(); + Dtype* diff = bottom->mutable_cpu_diff(); + int count = bottom->count(); + caffe_axpy(count, this->coeff_ * 2., data, diff); + Dtype penalty = caffe_cpu_dot(count, data, data); + return this->coeff_ * penalty; +} + +template +Dtype MaxNormRegularizer::Regularize_cpu(Blob* bottom) { + if (this->coeff_ == 0) { + return Dtype(0); + } + const Dtype* data = bottom->cpu_data(); + Dtype* diff = bottom->mutable_cpu_diff(); + int count = bottom->count(); + Dtype penalty = 0; + // TODO: Implement MaxNormRegularizer::Regularize_cpu + return this->coeff_ * penalty; +} + +template +Regularizer* GetRegularizer(const RegularizerParameter& param) { + const RegularizerParameter_RegularizerType type = param.type(); + if (type == REG_TYPE(L1)) { + return new L1Regularizer(param); + } else if (type == REG_TYPE(L2)) { + return new L2Regularizer(param); + } else if (type == REG_TYPE(MAX_NORM)) { + return new 
MaxNormRegularizer(param); + } else { + LOG(FATAL) << "Unknown regularizer type: " << type; + } + // just to suppress old compiler warnings. + return (Regularizer*) (NULL); +} + +template Regularizer* GetRegularizer( + const RegularizerParameter& param); +template Regularizer* GetRegularizer( + const RegularizerParameter& param); + +INSTANTIATE_CLASS(Regularizer); +INSTANTIATE_CLASS(L1Regularizer); +INSTANTIATE_CLASS(L2Regularizer); +INSTANTIATE_CLASS(MaxNormRegularizer); + +} // namespace caffe diff --git a/src/caffe/regularizer.cu b/src/caffe/regularizer.cu new file mode 100644 index 00000000000..973ed4d6c17 --- /dev/null +++ b/src/caffe/regularizer.cu @@ -0,0 +1,77 @@ +// Copyright 2014 kloudkl@github + +#include // for std::abs + +#include "caffe/common.hpp" +#include "caffe/proto/caffe.pb.h" +#include "caffe/regularizer.hpp" +#include "caffe/util/math_functions.hpp" // for caffe_gpu_asum + +namespace caffe { + +template +__device__ inline int gpu_sign(const Dtype val) { + return (Dtype(0) < val) - (val < Dtype(0)); +} + +template __device__ int gpu_sign(const float val); +template __device__ int gpu_sign(const double val); + +template +__global__ void ScaleSign(const int n, const Dtype coeff, const Dtype* data, + Dtype* diff) { + CUDA_KERNEL_LOOP(index, n) { + diff[index] += coeff * gpu_sign(data[index]); + } +} + +template +Dtype L1Regularizer::Regularize_gpu(Blob* bottom) { + if (this->coeff_ == 0) { + return Dtype(0); + } + const Dtype* data = bottom->gpu_data(); + Dtype* diff = bottom->mutable_gpu_diff(); + int count = bottom->count(); + /* NOLINT_NEXT_LINE(whitespace/operators) */ + ScaleSign<<>>( + count, this->coeff_, data, diff); + CUDA_POST_KERNEL_CHECK; + Dtype penalty = 0; + caffe_gpu_asum(count, data, &penalty); + return this->coeff_ * penalty; +} + +template +Dtype L2Regularizer::Regularize_gpu(Blob* bottom) { + if (this->coeff_ == 0) { + return Dtype(0); + } + const Dtype* data = bottom->gpu_data(); + Dtype* diff = bottom->mutable_gpu_diff(); + int count = bottom->count(); + caffe_gpu_axpy(count, this->coeff_ * 2., data, diff); + Dtype penalty = 0; + caffe_gpu_dot(count, data, data, &penalty); + return this->coeff_ * penalty; +} + +template +Dtype MaxNormRegularizer::Regularize_gpu(Blob* bottom) { + if (this->coeff_ == 0) { + return Dtype(0); + } + const Dtype* data = bottom->cpu_data(); + Dtype* diff = bottom->mutable_cpu_diff(); + int count = bottom->count(); + Dtype penalty = 0; + // TODO: Implement MaxNormRegularizer::Regularize_cpu + return this->coeff_ * penalty; +} + +INSTANTIATE_CLASS(Regularizer); +INSTANTIATE_CLASS(L1Regularizer); +INSTANTIATE_CLASS(L2Regularizer); +INSTANTIATE_CLASS(MaxNormRegularizer); + +} // namespace caffe diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index eb024856841..fb46c4ec4f3 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -215,7 +215,7 @@ void SGDSolver::ComputeUpdateValue() { // Compute the value to history, and then copy them to the blob's diff. 
Dtype local_rate = rate * net_params_lr[param_id]; Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; - caffe_axpby(net_params[param_id]->count(), local_rate, + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, net_params[param_id]->cpu_diff(), momentum, history_[param_id]->mutable_cpu_data()); if (local_decay) { diff --git a/src/caffe/test/test_common.cpp b/src/caffe/test/test_common.cpp index 275c6e1bf73..12e7168867f 100644 --- a/src/caffe/test/test_common.cpp +++ b/src/caffe/test/test_common.cpp @@ -6,7 +6,7 @@ #include "gtest/gtest.h" #include "caffe/common.hpp" #include "caffe/syncedmem.hpp" - +#include "caffe/util/math_functions.hpp" #include "caffe/test/test_caffe_main.hpp" namespace caffe { @@ -19,10 +19,6 @@ TEST_F(CommonTest, TestCublasHandler) { EXPECT_TRUE(Caffe::cublas_handle()); } -TEST_F(CommonTest, TestVslStream) { - EXPECT_TRUE(Caffe::vsl_stream()); -} - TEST_F(CommonTest, TestBrewMode) { Caffe::set_mode(Caffe::CPU); EXPECT_EQ(Caffe::mode(), Caffe::CPU); @@ -40,18 +36,19 @@ TEST_F(CommonTest, TestRandSeedCPU) { SyncedMemory data_a(10 * sizeof(int)); SyncedMemory data_b(10 * sizeof(int)); Caffe::set_random_seed(1701); - viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(), - 10, reinterpret_cast(data_a.mutable_cpu_data()), 0.5); + caffe_vRngBernoulli(10, + reinterpret_cast(data_a.mutable_cpu_data()), 0.5); + Caffe::set_random_seed(1701); - viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(), - 10, reinterpret_cast(data_b.mutable_cpu_data()), 0.5); + caffe_vRngBernoulli(10, + reinterpret_cast(data_b.mutable_cpu_data()), 0.5); + for (int i = 0; i < 10; ++i) { EXPECT_EQ(((const int*)(data_a.cpu_data()))[i], ((const int*)(data_b.cpu_data()))[i]); } } - TEST_F(CommonTest, TestRandSeedGPU) { SyncedMemory data_a(10 * sizeof(unsigned int)); SyncedMemory data_b(10 * sizeof(unsigned int)); @@ -67,5 +64,4 @@ TEST_F(CommonTest, TestRandSeedGPU) { } } - } // namespace caffe diff --git a/src/caffe/test/test_flatten_layer.cpp b/src/caffe/test/test_flatten_layer.cpp index 41c0453696c..f241135db57 100644 --- a/src/caffe/test/test_flatten_layer.cpp +++ b/src/caffe/test/test_flatten_layer.cpp @@ -23,6 +23,7 @@ class FlattenLayerTest : public ::testing::Test { FlattenLayerTest() : blob_bottom_(new Blob(2, 3, 6, 5)), blob_top_(new Blob()) { + Caffe::set_random_seed(1701); // fill the values FillerParameter filler_param; GaussianFiller filler(filler_param); @@ -73,6 +74,8 @@ TYPED_TEST(FlattenLayerTest, TestGPU) { for (int c = 0; c < 3 * 6 * 5; ++c) { EXPECT_EQ(this->blob_top_->data_at(0, c, 0, 0), this->blob_bottom_->data_at(0, c / (6 * 5), (c / 5) % 6, c % 5)); + EXPECT_EQ(this->blob_top_->data_at(1, c, 0, 0), + this->blob_bottom_->data_at(1, c / (6 * 5), (c / 5) % 6, c % 5)); } } diff --git a/src/caffe/test/test_gradient_check_util.hpp b/src/caffe/test/test_gradient_check_util.hpp index 895e9965a9a..6e895241f44 100644 --- a/src/caffe/test/test_gradient_check_util.hpp +++ b/src/caffe/test/test_gradient_check_util.hpp @@ -84,31 +84,30 @@ void GradientChecker::CheckGradientSingle(Layer* layer, } // go through the bottom and parameter blobs // LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs."; - for (int blobid = 0; blobid < blobs_to_check.size(); ++blobid) { - Blob* current_blob = blobs_to_check[blobid]; - // LOG(ERROR) << "Blob " << blobid << ": checking " << current_blob->count() - // << " parameters."; + for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) { + Blob* current_blob = blobs_to_check[blob_id]; + // 
LOG(ERROR) << "Blob " << blob_id << ": checking " + // << current_blob->count() << " parameters."; // go through the values for (int feat_id = 0; feat_id < current_blob->count(); ++feat_id) { // First, obtain the original data Caffe::set_random_seed(seed_); - layer->Forward(*bottom, top); - Dtype computed_objective = GetObjAndGradient(top, top_id, top_data_id); - // Get any additional loss from the layer - computed_objective += layer->Backward(*top, true, bottom); + // Get any loss from the layer + Dtype computed_objective = layer->Forward(*bottom, top); + // Get additional loss from the objective + computed_objective += GetObjAndGradient(top, top_id, top_data_id); + layer->Backward(*top, true, bottom); Dtype computed_gradient = current_blob->cpu_diff()[feat_id]; // compute score by adding stepsize current_blob->mutable_cpu_data()[feat_id] += stepsize_; Caffe::set_random_seed(seed_); - layer->Forward(*bottom, top); - Dtype positive_objective = GetObjAndGradient(top, top_id, top_data_id); - positive_objective += layer->Backward(*top, true, bottom); + Dtype positive_objective = layer->Forward(*bottom, top); + positive_objective += GetObjAndGradient(top, top_id, top_data_id); // compute score by subtracting stepsize current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2; Caffe::set_random_seed(seed_); - layer->Forward(*bottom, top); - Dtype negative_objective = GetObjAndGradient(top, top_id, top_data_id); - negative_objective += layer->Backward(*top, true, bottom); + Dtype negative_objective = layer->Forward(*bottom, top); + negative_objective += GetObjAndGradient(top, top_id, top_data_id); // Recover stepsize current_blob->mutable_cpu_data()[feat_id] += stepsize_; Dtype estimated_gradient = (positive_objective - negative_objective) / @@ -123,7 +122,7 @@ void GradientChecker::CheckGradientSingle(Layer* layer, max(fabs(computed_gradient), fabs(estimated_gradient)), 1.); EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale) << "debug: (top_id, top_data_id, blob_id, feat_id)=" - << top_id << "," << top_data_id << "," << blobid << "," << feat_id; + << top_id << "," << top_data_id << "," << blob_id << "," << feat_id; } // LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id]; // LOG(ERROR) << "computed gradient: " << computed_gradient diff --git a/src/caffe/test/test_hdf5_output_layer.cpp b/src/caffe/test/test_hdf5_output_layer.cpp new file mode 100644 index 00000000000..3cbfb3f35a8 --- /dev/null +++ b/src/caffe/test/test_hdf5_output_layer.cpp @@ -0,0 +1,127 @@ +// Copyright 2014 kloudkl@github + +#include +#include +#include + +#include "gtest/gtest.h" +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/util/io.hpp" +#include "caffe/vision_layers.hpp" +#include "caffe/proto/caffe.pb.h" +#include "caffe/test/test_caffe_main.hpp" + +namespace caffe { +using std::string; +using std::vector; + +extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; + +template +class HDF5OutputLayerTest : public ::testing::Test { + protected: + HDF5OutputLayerTest() + : output_file_name_("/tmp/test_hdf5_output_layer-sample_data.hdf5"), + input_file_name_("src/caffe/test/test_data/sample_data.h5"), + blob_data_(new Blob()), + blob_label_(new Blob()), + num_(5), + channels_(8), + height_(5), + width_(5) { + } + virtual void SetUp() { + } + + virtual ~HDF5OutputLayerTest() { + delete blob_data_; + delete blob_label_; + } + + void CheckBlobEqual(const Blob& b1, const Blob& b2); + + string output_file_name_; + string input_file_name_; + Blob* const blob_data_; + Blob* const 
blob_label_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; + int num_; + int channels_; + int height_; + int width_; +}; + +template +void HDF5OutputLayerTest::CheckBlobEqual( + const Blob& b1, const Blob& b2) { + EXPECT_EQ(b1.num(), b2.num()); + EXPECT_EQ(b1.channels(), b2.channels()); + EXPECT_EQ(b1.height(), b2.height()); + EXPECT_EQ(b1.width(), b2.width()); + for (int n = 0; n < b1.num(); ++n) { + for (int c = 0; c < b1.channels(); ++c) { + for (int h = 0; h < b1.height(); ++h) { + for (int w = 0; w < b1.width(); ++w) { + EXPECT_EQ(b1.data_at(n, c, h, w), b1.data_at(n, c, h, w)); + } + } + } + } +} + +typedef ::testing::Types Dtypes; +TYPED_TEST_CASE(HDF5OutputLayerTest, Dtypes); + +TYPED_TEST(HDF5OutputLayerTest, TestForward) { + LOG(INFO) << "Loading HDF5 file " << this->input_file_name_; + hid_t file_id = H5Fopen(this->input_file_name_.c_str(), H5F_ACC_RDONLY, + H5P_DEFAULT); + ASSERT_GE(file_id, 0) << "Failed to open HDF5 file" << + this->input_file_name_; + hdf5_load_nd_dataset(file_id, HDF5_DATA_DATASET_NAME, 0, 4, + this->blob_data_); + hdf5_load_nd_dataset(file_id, HDF5_DATA_LABEL_NAME, 0, 4, + this->blob_label_); + herr_t status = H5Fclose(file_id); + EXPECT_GE(status, 0) << "Failed to close HDF5 file " << + this->input_file_name_; + this->blob_bottom_vec_.push_back(this->blob_data_); + this->blob_bottom_vec_.push_back(this->blob_label_); + + Caffe::Brew modes[] = { Caffe::CPU, Caffe::GPU }; + for (int m = 0; m < 2; ++m) { + Caffe::set_mode(modes[m]); + LayerParameter param; + param.mutable_hdf5_output_param()->set_file_name(this->output_file_name_); + // This code block ensures that the layer is deconstructed and + // the output hdf5 file is closed. + { + HDF5OutputLayer layer(param); + EXPECT_EQ(layer.file_name(), this->output_file_name_); + layer.SetUp(this->blob_bottom_vec_, &this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_, &this->blob_top_vec_); + } + hid_t file_id = H5Fopen(this->output_file_name_.c_str(), H5F_ACC_RDONLY, + H5P_DEFAULT); + ASSERT_GE(file_id, 0) << "Failed to open HDF5 file" << + this->input_file_name_; + + Blob* blob_data = new Blob(); + hdf5_load_nd_dataset(file_id, HDF5_DATA_DATASET_NAME, 0, 4, + blob_data); + this->CheckBlobEqual(*(this->blob_data_), *blob_data); + + Blob* blob_label = new Blob(); + hdf5_load_nd_dataset(file_id, HDF5_DATA_LABEL_NAME, 0, 4, + blob_label); + this->CheckBlobEqual(*(this->blob_label_), *blob_label); + + herr_t status = H5Fclose(file_id); + EXPECT_GE(status, 0) << "Failed to close HDF5 file " << + this->output_file_name_; + } +} + +} // namespace caffe diff --git a/src/caffe/test/test_math_functions.cpp b/src/caffe/test/test_math_functions.cpp new file mode 100644 index 00000000000..ca059a9147c --- /dev/null +++ b/src/caffe/test/test_math_functions.cpp @@ -0,0 +1,195 @@ +// Copyright 2014 kloudkl@github + +#include // for uint32_t & uint64_t +#include +#include +#include // for std::fabs +#include // for rand_r + +#include "gtest/gtest.h" +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/util/math_functions.hpp" + +#include "caffe/test/test_caffe_main.hpp" + +namespace caffe { + +template +class MathFunctionsTest : public ::testing::Test { + protected: + MathFunctionsTest() + : blob_bottom_(new Blob()), + blob_top_(new Blob()) { + } + + virtual void SetUp() { + Caffe::set_random_seed(1701); + this->blob_bottom_->Reshape(11, 17, 19, 23); + this->blob_top_->Reshape(11, 17, 19, 23); + // fill the values + FillerParameter filler_param; + GaussianFiller 
filler(filler_param); + filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_top_); + } + + virtual ~MathFunctionsTest() { + delete blob_bottom_; + delete blob_top_; + } + // http://en.wikipedia.org/wiki/Hamming_distance + int ReferenceHammingDistance(const int n, const Dtype* x, const Dtype* y); + + Blob* const blob_bottom_; + Blob* const blob_top_; +}; + +#define REF_HAMMING_DIST(float_type, int_type) \ +template<> \ +int MathFunctionsTest::ReferenceHammingDistance(const int n, \ + const float_type* x, \ + const float_type* y) { \ + int dist = 0; \ + int_type val; \ + for (int i = 0; i < n; ++i) { \ + val = static_cast(x[i]) ^ static_cast(y[i]); \ + /* Count the number of set bits */ \ + while (val) { \ + ++dist; \ + val &= val - 1; \ + } \ + } \ + return dist; \ +} + +REF_HAMMING_DIST(float, uint32_t); +REF_HAMMING_DIST(double, uint64_t); + +typedef ::testing::Types Dtypes; +TYPED_TEST_CASE(MathFunctionsTest, Dtypes); + +TYPED_TEST(MathFunctionsTest, TestHammingDistance) { + int n = this->blob_bottom_->count(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + const TypeParam* y = this->blob_top_->cpu_data(); + CHECK_EQ(this->ReferenceHammingDistance(n, x, y), + caffe_hamming_distance(n, x, y)); +} + +TYPED_TEST(MathFunctionsTest, TestAsumCPU) { + int n = this->blob_bottom_->count(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + TypeParam std_asum = 0; + for (int i = 0; i < n; ++i) { + std_asum += std::fabs(x[i]); + } + TypeParam cpu_asum = caffe_cpu_asum(n, x); + CHECK_LT((cpu_asum - std_asum) / std_asum, 1e-2); +} + +TYPED_TEST(MathFunctionsTest, TestAsumGPU) { + int n = this->blob_bottom_->count(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + TypeParam std_asum = 0; + for (int i = 0; i < n; ++i) { + std_asum += std::fabs(x[i]); + } + TypeParam gpu_asum; + caffe_gpu_asum(n, this->blob_bottom_->gpu_data(), &gpu_asum); + CHECK_LT((gpu_asum - std_asum) / std_asum, 1e-2); +} + +TYPED_TEST(MathFunctionsTest, TestSignCPU) { + int n = this->blob_bottom_->count(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + caffe_cpu_sign(n, x, this->blob_bottom_->mutable_cpu_diff()); + const TypeParam* signs = this->blob_bottom_->cpu_diff(); + for (int i = 0; i < n; ++i) { + CHECK_EQ(signs[i], x[i] > 0 ? 1 : (x[i] < 0 ? -1 : 0)); + } +} + +TYPED_TEST(MathFunctionsTest, TestSignGPU) { + int n = this->blob_bottom_->count(); + caffe_gpu_sign(n, this->blob_bottom_->gpu_data(), + this->blob_bottom_->mutable_gpu_diff()); + const TypeParam* signs = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + CHECK_EQ(signs[i], x[i] > 0 ? 1 : (x[i] < 0 ? -1 : 0)); + } +} + +TYPED_TEST(MathFunctionsTest, TestSgnbitCPU) { + int n = this->blob_bottom_->count(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + caffe_cpu_sgnbit(n, x, this->blob_bottom_->mutable_cpu_diff()); + const TypeParam* signbits = this->blob_bottom_->cpu_diff(); + for (int i = 0; i < n; ++i) { + CHECK_EQ(signbits[i], x[i] < 0 ? 1 : 0); + } +} + +TYPED_TEST(MathFunctionsTest, TestSgnbitGPU) { + int n = this->blob_bottom_->count(); + caffe_gpu_sgnbit(n, this->blob_bottom_->gpu_data(), + this->blob_bottom_->mutable_gpu_diff()); + const TypeParam* signbits = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + CHECK_EQ(signbits[i], x[i] < 0 ? 
1 : 0); + } +} + +TYPED_TEST(MathFunctionsTest, TestFabsCPU) { + int n = this->blob_bottom_->count(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + caffe_cpu_fabs(n, x, this->blob_bottom_->mutable_cpu_diff()); + const TypeParam* abs_val = this->blob_bottom_->cpu_diff(); + for (int i = 0; i < n; ++i) { + CHECK_EQ(abs_val[i], x[i] > 0 ? x[i] : -x[i]); + } +} + +TYPED_TEST(MathFunctionsTest, TestFabsGPU) { + int n = this->blob_bottom_->count(); + caffe_gpu_fabs(n, this->blob_bottom_->gpu_data(), + this->blob_bottom_->mutable_gpu_diff()); + const TypeParam* abs_val = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + CHECK_EQ(abs_val[i], x[i] > 0 ? x[i] : -x[i]); + } +} + +TYPED_TEST(MathFunctionsTest, TestScaleCPU) { + int n = this->blob_bottom_->count(); + // NOLINT_NEXT_LINE(runtime/threadsafe_fn) + TypeParam alpha = this->blob_bottom_->cpu_diff()[rand() % + this->blob_bottom_->count()]; + caffe_cpu_scale(n, alpha, this->blob_bottom_->cpu_data(), + this->blob_bottom_->mutable_cpu_diff()); + const TypeParam* scaled = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + CHECK_EQ(scaled[i], x[i] * alpha); + } +} + +TYPED_TEST(MathFunctionsTest, TestScaleGPU) { + int n = this->blob_bottom_->count(); + // NOLINT_NEXT_LINE(runtime/threadsafe_fn) + TypeParam alpha = this->blob_bottom_->cpu_diff()[rand() % + this->blob_bottom_->count()]; + caffe_gpu_scale(n, alpha, this->blob_bottom_->gpu_data(), + this->blob_bottom_->mutable_gpu_diff()); + const TypeParam* scaled = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + CHECK_EQ(scaled[i], x[i] * alpha); + } +} + +} // namespace caffe diff --git a/src/caffe/test/test_multinomial_logistic_loss_layer.cpp b/src/caffe/test/test_multinomial_logistic_loss_layer.cpp index 5169b708520..5a61df79d89 100644 --- a/src/caffe/test/test_multinomial_logistic_loss_layer.cpp +++ b/src/caffe/test/test_multinomial_logistic_loss_layer.cpp @@ -25,6 +25,7 @@ class MultinomialLogisticLossLayerTest : public ::testing::Test { MultinomialLogisticLossLayerTest() : blob_bottom_data_(new Blob(10, 5, 1, 1)), blob_bottom_label_(new Blob(10, 1, 1, 1)) { + Caffe::set_random_seed(1701); // fill the values FillerParameter filler_param; PositiveUnitballFiller filler(filler_param); @@ -55,7 +56,7 @@ TYPED_TEST(MultinomialLogisticLossLayerTest, TestGradientCPU) { Caffe::set_mode(Caffe::CPU); MultinomialLogisticLossLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, &this->blob_top_vec_); - GradientChecker checker(1e-2, 1e-2, 1701, 0, 0.05); + GradientChecker checker(1e-2, 2*1e-2, 1701, 0, 0.05); checker.CheckGradientSingle(&layer, &(this->blob_bottom_vec_), &(this->blob_top_vec_), 0, -1, -1); } diff --git a/src/caffe/test/test_net.cpp b/src/caffe/test/test_net.cpp new file mode 100644 index 00000000000..fd7265c47df --- /dev/null +++ b/src/caffe/test/test_net.cpp @@ -0,0 +1,148 @@ +// Copyright 2014 kloudkl@github + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "caffe/common.hpp" +#include "caffe/net.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +#include "caffe/test/test_caffe_main.hpp" + +namespace caffe { + + +template +class NetTest : public ::testing::Test { + protected: + NetTest() : filename(NULL) { + } + + virtual void SetUp() { // Create the leveldb + filename = tmpnam(NULL); // get temp name + LOG(INFO) << "Using 
temporary leveldb " << filename; + leveldb::DB* db; + leveldb::Options options; + options.error_if_exists = true; + options.create_if_missing = true; + leveldb::Status status = leveldb::DB::Open(options, filename, &db); + CHECK(status.ok()); + for (int i = 0; i < 5; ++i) { + Datum datum; + datum.set_label(i); + datum.set_channels(2); + datum.set_height(3); + datum.set_width(4); + std::string* data = datum.mutable_data(); + for (int j = 0; j < 24; ++j) { + data->push_back((uint8_t)i); + } + std::stringstream ss; + ss << i; + db->Put(leveldb::WriteOptions(), ss.str(), datum.SerializeAsString()); + } + delete db; + + const string& proto_prefix = + "name: 'TestNetwork' " + "layers: { " + " layer { " + " name: 'data' " + " type: 'data' "; + const string& proto_suffix = + " batchsize: 1 " + " } " + " top: 'data' " + " top: 'label' " + "} " + "layers: { " + " layer { " + " name: 'innerproduct' " + " type: 'innerproduct' " + " num_output: 1000 " + " weight_filler { " + " type: 'gaussian' " + " std: 0.01 " + " } " + " bias_filler { " + " type: 'constant' " + " value: 0 " + " } " + " blobs_lr: 1. " + " blobs_lr: 2. " + " weight_decay: 1. " + " weight_decay: 0. " + " } " + " bottom: 'data' " + " top: 'innerproduct' " + "} " + "layers: { " + " layer { " + " name: 'loss' " + " type: 'softmax_loss' " + " } " + " bottom: 'innerproduct' " + " bottom: 'label' " + "} "; + proto = proto_prefix + "source: '" + string(this->filename) + + "' " + proto_suffix; + } + + virtual ~NetTest() { + } + + char* filename; + string proto; +}; + +typedef ::testing::Types Dtypes; +TYPED_TEST_CASE(NetTest, Dtypes); + +TYPED_TEST(NetTest, TestHasBlob) { + NetParameter param; + CHECK(google::protobuf::TextFormat::ParseFromString(this->proto, + ¶m)); + Net net(param); + EXPECT_TRUE(net.has_blob("data")); + EXPECT_TRUE(net.has_blob("label")); + EXPECT_TRUE(net.has_blob("innerproduct")); + EXPECT_FALSE(net.has_blob("loss")); +} + +TYPED_TEST(NetTest, TestGetBlob) { + NetParameter param; + CHECK(google::protobuf::TextFormat::ParseFromString(this->proto, + ¶m)); + Net net(param); + EXPECT_EQ(net.blob_by_name("data"), net.blobs()[0]); + EXPECT_EQ(net.blob_by_name("label"), net.blobs()[1]); + EXPECT_EQ(net.blob_by_name("innerproduct"), net.blobs()[2]); + EXPECT_FALSE(net.blob_by_name("loss")); +} + +TYPED_TEST(NetTest, TestHasLayer) { + NetParameter param; + CHECK(google::protobuf::TextFormat::ParseFromString(this->proto, + ¶m)); + Net net(param); + EXPECT_TRUE(net.has_layer("data")); + EXPECT_TRUE(net.has_layer("innerproduct")); + EXPECT_TRUE(net.has_layer("loss")); + EXPECT_FALSE(net.has_layer("label")); +} + +TYPED_TEST(NetTest, TestGetLayerByName) { + NetParameter param; + CHECK(google::protobuf::TextFormat::ParseFromString(this->proto, + ¶m)); + Net net(param); + EXPECT_EQ(net.layer_by_name("data"), net.layers()[0]); + EXPECT_EQ(net.layer_by_name("innerproduct"), net.layers()[1]); + EXPECT_EQ(net.layer_by_name("loss"), net.layers()[2]); + EXPECT_FALSE(net.layer_by_name("label")); +} + +} // namespace caffe diff --git a/src/caffe/test/test_random_number_generator.cpp b/src/caffe/test/test_random_number_generator.cpp new file mode 100644 index 00000000000..267e7731475 --- /dev/null +++ b/src/caffe/test/test_random_number_generator.cpp @@ -0,0 +1,98 @@ +// Copyright 2014 BVLC and contributors. 
+ +#include +#include +#include + +#include "gtest/gtest.h" +#include "caffe/common.hpp" +#include "caffe/syncedmem.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/test/test_caffe_main.hpp" + +namespace caffe { + +template +class RandomNumberGeneratorTest : public ::testing::Test { + public: + virtual ~RandomNumberGeneratorTest() {} + + Dtype sample_mean(const Dtype* const seqs, const size_t sample_size) { + double sum = 0; + for (int i = 0; i < sample_size; ++i) { + sum += seqs[i]; + } + return sum / sample_size; + } + + Dtype sample_mean(const int* const seqs, const size_t sample_size) { + Dtype sum = 0; + for (int i = 0; i < sample_size; ++i) { + sum += Dtype(seqs[i]); + } + return sum / sample_size; + } + + Dtype mean_bound(const Dtype std, const size_t sample_size) { + return std/sqrt(static_cast(sample_size)); + } +}; + + +typedef ::testing::Types Dtypes; +TYPED_TEST_CASE(RandomNumberGeneratorTest, Dtypes); + + +TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussian) { + size_t sample_size = 10000; + SyncedMemory data_a(sample_size * sizeof(TypeParam)); + Caffe::set_random_seed(1701); + TypeParam mu = 0; + TypeParam sigma = 1; + caffe_vRngGaussian(sample_size, + reinterpret_cast(data_a.mutable_cpu_data()), mu, sigma); + TypeParam true_mean = mu; + TypeParam true_std = sigma; + TypeParam bound = this->mean_bound(true_std, sample_size); + TypeParam empirical_mean = + this->sample_mean(reinterpret_cast(data_a.cpu_data()), + sample_size); + EXPECT_NEAR(empirical_mean, true_mean, bound); +} + + +TYPED_TEST(RandomNumberGeneratorTest, TestRngUniform) { + size_t sample_size = 10000; + SyncedMemory data_a(sample_size * sizeof(TypeParam)); + Caffe::set_random_seed(1701); + TypeParam lower = 0; + TypeParam upper = 1; + caffe_vRngUniform(sample_size, + reinterpret_cast(data_a.mutable_cpu_data()), lower, upper); + TypeParam true_mean = (lower + upper) / 2; + TypeParam true_std = (upper - lower) / sqrt(12); + TypeParam bound = this->mean_bound(true_std, sample_size); + TypeParam empirical_mean = + this->sample_mean(reinterpret_cast(data_a.cpu_data()), + sample_size); + EXPECT_NEAR(empirical_mean, true_mean, bound); +} + + +TYPED_TEST(RandomNumberGeneratorTest, TestRngBernoulli) { + size_t sample_size = 10000; + SyncedMemory data_a(sample_size * sizeof(int)); + Caffe::set_random_seed(1701); + double p = 0.3; + caffe_vRngBernoulli(sample_size, + static_cast(data_a.mutable_cpu_data()), p); + TypeParam true_mean = p; + TypeParam true_std = sqrt(p * (1 - p)); + TypeParam bound = this->mean_bound(true_std, sample_size); + TypeParam empirical_mean = + this->sample_mean((const int *)data_a.cpu_data(), sample_size); + EXPECT_NEAR(empirical_mean, true_mean, bound); +} + + +} // namespace caffe diff --git a/src/caffe/test/test_regularizer_as_loss_layer.cpp b/src/caffe/test/test_regularizer_as_loss_layer.cpp new file mode 100644 index 00000000000..fb65d200f87 --- /dev/null +++ b/src/caffe/test/test_regularizer_as_loss_layer.cpp @@ -0,0 +1,154 @@ +// Copyright 2014 kloudkl@github + +#include +#include // for memset +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/proto/caffe.pb.h" +#include "caffe/regularizer.hpp" +#include "caffe/test/test_gradient_check_util.hpp" +#include "caffe/vision_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "gtest/gtest.h" + +namespace caffe { + +extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; + +template +class RegularizationAsLossTest : public ::testing::Test { + protected: + 
RegularizationAsLossTest() + : blob_bottom_data_(new Blob(10, 5, 3, 2)) { + // fill the values + FillerParameter filler_param; + filler_param.set_std(10); + GaussianFiller filler(filler_param); + filler.Fill(this->blob_bottom_data_); + blob_bottom_vec_.push_back(blob_bottom_data_); + } + virtual ~RegularizationAsLossTest() { + delete blob_bottom_data_; + } + + void Check(const bool death_condition, + const LayerParameter& layer_param, const Dtype step_size, + const Dtype threshold, const unsigned int seed = 1701); + + Blob* const blob_bottom_data_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; +}; + +typedef ::testing::Types Dtypes; +TYPED_TEST_CASE(RegularizationAsLossTest, Dtypes); + +// The death test only abort the current function +// http://code.google.com/p/googletest/wiki/V1_6_AdvancedGuide +// #Propagating_Fatal_Failures +// We want to test all the combinations of coefficients. +// If this subroutine is place in the test cases directly, +// the test cases cannot enumerate the combinations after the first failure. +template +void RegularizationAsLossTest::Check( + const bool is_death_condition, const LayerParameter& layer_param, + const Dtype step_size, const Dtype threshold, const unsigned int seed) { + if (is_death_condition) { + ASSERT_DEATH( + RegularizerAsLossLayer layer(layer_param), + "Regularizer coefficient must be greater than or equal to zero"); + } else { + RegularizerAsLossLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, &this->blob_top_vec_); + GradientChecker checker(step_size, threshold, seed); + for (int loop = 0; loop < 10; ++loop) { + checker.CheckGradientSingle(&layer, &(this->blob_bottom_vec_), + &(this->blob_top_vec_), 0, -1, -1); + } + } +} + +// ::testing::FLAGS_gtest_death_test_style = "threadsafe"; +// To suppress Google Test warning of death tests running in multiple threads +// http://code.google.com/p/googletest/wiki/AdvancedGuide#Death_Test_Styles +#define TEST_REG_LOSS_LAYER_SINGLE_TYPE(mode, regularizer) \ +TYPED_TEST(RegularizationAsLossTest, TestGradient##mode##_##regularizer) { \ + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; \ + Caffe::set_mode(Caffe::mode); \ + TypeParam coeff[] = {1, 0, -1}; \ + /* Restart from failure crash is too slow. Do not test negative coeff. */ \ + int num_ceoff = 2; \ + bool is_death_condition; \ + for (int i = 0; i < num_ceoff; ++i) { \ + LayerParameter layer_param; \ + RegularizerParameter* reg_param = layer_param.add_regularizer(); \ + reg_param->set_type(REG_TYPE(regularizer)); \ + reg_param->set_coeff(coeff[i]); \ + is_death_condition = coeff[i] < 0; \ + this->Check(is_death_condition, layer_param, 1e-2, 5e-2, 1701); \ + } \ +} + +TEST_REG_LOSS_LAYER_SINGLE_TYPE(CPU, L1); +TEST_REG_LOSS_LAYER_SINGLE_TYPE(CPU, L2); +TEST_REG_LOSS_LAYER_SINGLE_TYPE(CPU, MAX_NORM); + +TEST_REG_LOSS_LAYER_SINGLE_TYPE(GPU, L1); +TEST_REG_LOSS_LAYER_SINGLE_TYPE(GPU, L2); +TEST_REG_LOSS_LAYER_SINGLE_TYPE(GPU, MAX_NORM); + +#define TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(mode, regularizer_type_a, \ + regularizer_type_b) \ +TYPED_TEST(RegularizationAsLossTest, \ + TestGradient##mode##_##regularizer_type_a##_##regularizer_type_b) { \ + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; \ + Caffe::set_mode(Caffe::mode); \ + TypeParam coeff[] = {1, 0, -1}; \ + /* Restart from failure crash is too slow. Do not test negative coeff. 
*/ \ + int num_ceoff = 2; \ + bool is_death_condition; \ + for (int i = 0; i < num_ceoff; ++i) { \ + for (int j = 0; j < num_ceoff; ++j) { \ + LayerParameter layer_param; \ + RegularizerParameter* reg_param; \ + reg_param = layer_param.add_regularizer(); \ + reg_param->set_type(REG_TYPE(regularizer_type_a)); \ + reg_param->set_coeff(coeff[i]); \ + reg_param = layer_param.add_regularizer(); \ + reg_param->set_type(REG_TYPE(regularizer_type_b)); \ + reg_param->set_coeff(coeff[j]); \ + is_death_condition = coeff[i] < 0 || coeff[j] < 0; \ + this->Check(is_death_condition, layer_param, 1e-2, 5e-2, 1701); \ + } \ + } \ +} + +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(CPU, L1, L1); +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(CPU, L1, L2); +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(CPU, L1, MAX_NORM); + +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(CPU, L2, L1); +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(CPU, L2, L2); +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(CPU, L2, MAX_NORM); + +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(CPU, MAX_NORM, L1); +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(CPU, MAX_NORM, L2); +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(CPU, MAX_NORM, MAX_NORM); + +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(GPU, L1, L1); +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(GPU, L1, L2); +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(GPU, L1, MAX_NORM); + +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(GPU, L2, L1); +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(GPU, L2, L2); +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(GPU, L2, MAX_NORM); + +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(GPU, MAX_NORM, L1); +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(GPU, MAX_NORM, L2); +TEST_REGULARIZER_AS_LOSS_LAYER_TWO_TYPES(GPU, MAX_NORM, MAX_NORM); + +} // namespace caffe diff --git a/src/caffe/test/test_stochastic_pooling.cpp b/src/caffe/test/test_stochastic_pooling.cpp index d60d04e8df7..aedd6f3c2f2 100644 --- a/src/caffe/test/test_stochastic_pooling.cpp +++ b/src/caffe/test/test_stochastic_pooling.cpp @@ -146,8 +146,6 @@ TYPED_TEST(StochasticPoolingLayerTest, TestStochasticGPUTestPhase) { } } - - TYPED_TEST(StochasticPoolingLayerTest, TestGradientGPU) { Caffe::set_mode(Caffe::GPU); Caffe::set_phase(Caffe::TRAIN); @@ -157,7 +155,7 @@ TYPED_TEST(StochasticPoolingLayerTest, TestGradientGPU) { layer_param.set_pool(LayerParameter_PoolMethod_STOCHASTIC); PoolingLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); + GradientChecker checker(1e-4, 1e-2); // it is too expensive to call curand multiple times, so we don't do an // exhaustive gradient check. 
checker.CheckGradient(&layer, &(this->blob_bottom_vec_), diff --git a/src/caffe/test/test_util_blas.cpp b/src/caffe/test/test_util_blas.cpp index 3f3ff8b3a69..57f4eafce7d 100644 --- a/src/caffe/test/test_util_blas.cpp +++ b/src/caffe/test/test_util_blas.cpp @@ -3,7 +3,6 @@ #include #include "cuda_runtime.h" -#include "mkl.h" #include "cublas_v2.h" #include "gtest/gtest.h" diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp index 3ac69f9744e..053d7a40d44 100644 --- a/src/caffe/util/io.cpp +++ b/src/caffe/util/io.cpp @@ -142,4 +142,30 @@ void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, file_id, dataset_name_, blob->mutable_cpu_data()); } +template <> +void hdf5_save_nd_dataset( + const hid_t file_id, const string dataset_name, const Blob& blob) { + hsize_t dims[HDF5_NUM_DIMS]; + dims[0] = blob.num(); + dims[1] = blob.channels(); + dims[2] = blob.height(); + dims[3] = blob.width(); + herr_t status = H5LTmake_dataset_float( + file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); + CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name; +} + +template <> +void hdf5_save_nd_dataset( + const hid_t file_id, const string dataset_name, const Blob& blob) { + hsize_t dims[HDF5_NUM_DIMS]; + dims[0] = blob.num(); + dims[1] = blob.channels(); + dims[2] = blob.height(); + dims[3] = blob.width(); + herr_t status = H5LTmake_dataset_double( + file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); + CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name; +} + } // namespace caffe diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 60656b87093..80e420f5689 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -1,9 +1,14 @@ -// Copyright 2013 Yangqing Jia +// Copyright 2014 BVLC and contributors. 
-#include +#include +#include #include + +#include + #include "caffe/common.hpp" #include "caffe/util/math_functions.hpp" +#include "caffe/util/rng.hpp" namespace caffe { @@ -103,7 +108,6 @@ template <> void caffe_axpy(const int N, const double alpha, const double* X, double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); } - template <> void caffe_gpu_axpy(const int N, const float alpha, const float* X, float* Y) { @@ -116,18 +120,6 @@ void caffe_gpu_axpy(const int N, const double alpha, const double* X, CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); } -template <> -void caffe_axpby(const int N, const float alpha, const float* X, - const float beta, float* Y) { - cblas_saxpby(N, alpha, X, 1, beta, Y, 1); -} - -template <> -void caffe_axpby(const int N, const double alpha, const double* X, - const double beta, double* Y) { - cblas_daxpby(N, alpha, X, 1, beta, Y, 1); -} - template <> void caffe_copy(const int N, const float* X, float* Y) { cblas_scopy(N, X, 1, Y, 1); @@ -183,82 +175,85 @@ void caffe_gpu_axpby(const int N, const double alpha, const double* X, } template <> -void caffe_sqr(const int n, const float* a, float* y) { - vsSqr(n, a, y); +void caffe_cpu_axpby(const int N, const float alpha, const float* X, + const float beta, float* Y) { + cblas_saxpby(N, alpha, X, 1, beta, Y, 1); } template <> -void caffe_sqr(const int n, const double* a, double* y) { - vdSqr(n, a, y); +void caffe_cpu_axpby(const int N, const double alpha, const double* X, + const double beta, double* Y) { + cblas_daxpby(N, alpha, X, 1, beta, Y, 1); } template <> void caffe_add(const int n, const float* a, const float* b, - float* y) { vsAdd(n, a, b, y); } + float* y) { + vsAdd(n, a, b, y); +} template <> void caffe_add(const int n, const double* a, const double* b, - double* y) { vdAdd(n, a, b, y); } + double* y) { + vdAdd(n, a, b, y); +} template <> void caffe_sub(const int n, const float* a, const float* b, - float* y) { vsSub(n, a, b, y); } + float* y) { + vsSub(n, a, b, y); +} template <> void caffe_sub(const int n, const double* a, const double* b, - double* y) { vdSub(n, a, b, y); } + double* y) { + vdSub(n, a, b, y); +} template <> void caffe_mul(const int n, const float* a, const float* b, - float* y) { vsMul(n, a, b, y); } + float* y) { + vsMul(n, a, b, y); +} template <> void caffe_mul(const int n, const double* a, const double* b, - double* y) { vdMul(n, a, b, y); } + double* y) { + vdMul(n, a, b, y); +} template <> void caffe_div(const int n, const float* a, const float* b, - float* y) { vsDiv(n, a, b, y); } + float* y) { + vsDiv(n, a, b, y); +} template <> void caffe_div(const int n, const double* a, const double* b, - double* y) { vdDiv(n, a, b, y); } + double* y) { + vdDiv(n, a, b, y); +} template <> void caffe_powx(const int n, const float* a, const float b, - float* y) { vsPowx(n, a, b, y); } - -template <> -void caffe_powx(const int n, const double* a, const double b, - double* y) { vdPowx(n, a, b, y); } - -template <> -void caffe_vRngUniform(const int n, float* r, - const float a, const float b) { - VSL_CHECK(vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(), - n, r, a, b)); + float* y) { + vsPowx(n, a, b, y); } template <> -void caffe_vRngUniform(const int n, double* r, - const double a, const double b) { - VSL_CHECK(vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(), - n, r, a, b)); +void caffe_powx(const int n, const double* a, const double b, + double* y) { + vdPowx(n, a, b, y); } template <> -void caffe_vRngGaussian(const int n, float* r, const float 
a, - const float sigma) { - VSL_CHECK(vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER, - Caffe::vsl_stream(), n, r, a, sigma)); +void caffe_sqr(const int n, const float* a, float* y) { + vsSqr(n, a, y); } - template <> -void caffe_vRngGaussian(const int n, double* r, const double a, - const double sigma) { - VSL_CHECK(vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER, - Caffe::vsl_stream(), n, r, a, sigma)); +void caffe_sqr(const int n, const double* a, double* y) { + vdSqr(n, a, y); } template <> @@ -271,6 +266,86 @@ void caffe_exp(const int n, const double* a, double* y) { vdExp(n, a, y); } +template +Dtype caffe_nextafter(const Dtype b) { + return boost::math::nextafter( + b, std::numeric_limits::max()); +} + +template +float caffe_nextafter(const float b); + +template +double caffe_nextafter(const double b); + +template +void caffe_vRngUniform(const int n, Dtype* r, + const Dtype a, const Dtype b) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_LE(a, b); + + boost::uniform_real random_distribution( + a, caffe_nextafter(b)); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); + + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } +} + +template +void caffe_vRngUniform(const int n, float* r, + const float a, const float b); +template +void caffe_vRngUniform(const int n, double* r, + const double a, const double b); + +template +void caffe_vRngGaussian(const int n, Dtype* r, const Dtype a, + const Dtype sigma) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GT(sigma, 0); + boost::normal_distribution random_distribution(a, sigma); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); + + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } +} + +template +void caffe_vRngGaussian(const int n, float* r, const float a, + const float sigma); + +template +void caffe_vRngGaussian(const int n, double* r, const double a, + const double sigma); + +template +void caffe_vRngBernoulli(const int n, Dtype* r, const double p) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GE(p, 0); + CHECK_LE(p, 1); + boost::bernoulli_distribution random_distribution(p); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); + + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } +} + +template +void caffe_vRngBernoulli(const int n, int* r, const double p); + template <> float caffe_cpu_dot(const int n, const float* x, const float* y) { return cblas_sdot(n, x, 1, y, 1); @@ -293,4 +368,78 @@ void caffe_gpu_dot(const int n, const double* x, const double* y, CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); } +template <> +int caffe_hamming_distance(const int n, const float* x, + const float* y) { + int dist = 0; + for (int i = 0; i < n; ++i) { + dist += __builtin_popcount(static_cast(x[i]) ^ + static_cast(y[i])); + } + return dist; +} + +template <> +int caffe_hamming_distance(const int n, const double* x, + const double* y) { + int dist = 0; + for (int i = 0; i < n; ++i) { + dist += __builtin_popcountl(static_cast(x[i]) ^ + static_cast(y[i])); + } + return dist; +} + +template <> +float caffe_cpu_asum(const int n, const float* x) { + return cblas_sasum(n, x, 1); +} + +template <> +double caffe_cpu_asum(const int n, const double* x) { + return cblas_dasum(n, x, 1); +} + +template <> +void caffe_gpu_asum(const int n, const float* x, float* y) { + CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y)); +} + +template <> +void caffe_gpu_asum(const int n, const double* x, double* y) { + 
CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y)); +} + +INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sign); +INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sgnbit); +INSTANTIATE_CAFFE_CPU_UNARY_FUNC(fabs); + +template <> +void caffe_cpu_scale(const int n, const float alpha, const float *x, + float* y) { + cblas_scopy(n, x, 1, y, 1); + cblas_sscal(n, alpha, y, 1); +} + +template <> +void caffe_cpu_scale(const int n, const double alpha, const double *x, + double* y) { + cblas_dcopy(n, x, 1, y, 1); + cblas_dscal(n, alpha, y, 1); +} + +template <> +void caffe_gpu_scale(const int n, const float alpha, const float *x, + float* y) { + CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1)); +} + +template <> +void caffe_gpu_scale(const int n, const double alpha, const double *x, + double* y) { + CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1)); +} + } // namespace caffe diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu index 5491e246c48..85753aa567a 100644 --- a/src/caffe/util/math_functions.cu +++ b/src/caffe/util/math_functions.cu @@ -1,5 +1,7 @@ // Copyright 2013 Yangqing Jia +// Copyright 2014 kloudkl@github +#include // CUDA's, not caffe's, for fabs, signbit #include #include #include @@ -33,5 +35,9 @@ void caffe_gpu_mul(const int N, const double* a, N, a, b, y); } +DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) + - (x[index] < Dtype(0))); +DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); +DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(fabs, y[index] = fabs(x[index])); } // namespace caffe diff --git a/tools/extract_features.cpp b/tools/extract_features.cpp new file mode 100644 index 00000000000..e547db594ba --- /dev/null +++ b/tools/extract_features.cpp @@ -0,0 +1,173 @@ +// Copyright 2014 kloudkl@github + +#include // for snprintf +#include +#include +#include +#include +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/net.hpp" +#include "caffe/vision_layers.hpp" +#include "caffe/proto/caffe.pb.h" +#include "caffe/util/io.hpp" + +using namespace caffe; // NOLINT(build/namespaces) + +template +int feature_extraction_pipeline(int argc, char** argv); + +int main(int argc, char** argv) { + return feature_extraction_pipeline(argc, argv); +// return feature_extraction_pipeline(argc, argv); +} + +template +int feature_extraction_pipeline(int argc, char** argv) { + const int num_required_args = 6; + if (argc < num_required_args) { + LOG(ERROR)<< + "This program takes in a trained network and an input data layer, and then" + " extract features of the input data produced by the net.\n" + "Usage: demo_extract_features pretrained_net_param" + " feature_extraction_proto_file extract_feature_blob_name" + " save_feature_leveldb_name num_mini_batches [CPU/GPU] [DEVICE_ID=0]"; + return 1; + } + int arg_pos = num_required_args; + + arg_pos = num_required_args; + if (argc > arg_pos && strcmp(argv[arg_pos], "GPU") == 0) { + LOG(ERROR)<< "Using GPU"; + uint device_id = 0; + if (argc > arg_pos + 1) { + device_id = atoi(argv[arg_pos + 1]); + CHECK_GE(device_id, 0); + } + LOG(ERROR) << "Using Device_id=" << device_id; + Caffe::SetDevice(device_id); + Caffe::set_mode(Caffe::GPU); + } else { + LOG(ERROR) << "Using CPU"; + Caffe::set_mode(Caffe::CPU); + } + Caffe::set_phase(Caffe::TEST); + + NetParameter pretrained_net_param; + + arg_pos = 0; 
// the name of the executable + string pretrained_binary_proto(argv[++arg_pos]); + ReadProtoFromBinaryFile(pretrained_binary_proto.c_str(), + &pretrained_net_param); + + // Expected prototxt contains at least one data layer such as + // the layer data_layer_name and one feature blob such as the + // fc7 top blob to extract features. + /* + layers { + layer { + name: "data_layer_name" + type: "data" + source: "/path/to/your/images/to/extract/feature/images_leveldb" + meanfile: "/path/to/your/image_mean.binaryproto" + batchsize: 128 + cropsize: 227 + mirror: false + } + top: "data_blob_name" + top: "label_blob_name" + } + layers { + layer { + name: "drop7" + type: "dropout" + dropout_ratio: 0.5 + } + bottom: "fc7" + top: "fc7" + } + */ + NetParameter feature_extraction_net_param; + string feature_extraction_proto(argv[++arg_pos]); + ReadProtoFromTextFile(feature_extraction_proto, + &feature_extraction_net_param); + shared_ptr > feature_extraction_net( + new Net(feature_extraction_net_param)); + feature_extraction_net->CopyTrainedLayersFrom(pretrained_net_param); + + string extract_feature_blob_name(argv[++arg_pos]); + CHECK(feature_extraction_net->has_blob(extract_feature_blob_name)) + << "Unknown feature blob name " << extract_feature_blob_name + << " in the network " << feature_extraction_proto; + + string save_feature_leveldb_name(argv[++arg_pos]); + leveldb::DB* db; + leveldb::Options options; + options.error_if_exists = true; + options.create_if_missing = true; + options.write_buffer_size = 268435456; + LOG(INFO)<< "Opening leveldb " << save_feature_leveldb_name; + leveldb::Status status = leveldb::DB::Open(options, + save_feature_leveldb_name.c_str(), + &db); + CHECK(status.ok()) << "Failed to open leveldb " << save_feature_leveldb_name; + + int num_mini_batches = atoi(argv[++arg_pos]); + + LOG(ERROR)<< "Extacting Features"; + + Datum datum; + leveldb::WriteBatch* batch = new leveldb::WriteBatch(); + const int kMaxKeyStrLength = 100; + char key_str[kMaxKeyStrLength]; + int num_bytes_of_binary_code = sizeof(Dtype); + vector*> input_vec; + int image_index = 0; + for (int batch_index = 0; batch_index < num_mini_batches; ++batch_index) { + feature_extraction_net->Forward(input_vec); + const shared_ptr > feature_blob = feature_extraction_net + ->blob_by_name(extract_feature_blob_name); + int num_features = feature_blob->num(); + int dim_features = feature_blob->count() / num_features; + Dtype* feature_blob_data; + for (int n = 0; n < num_features; ++n) { + datum.set_height(dim_features); + datum.set_width(1); + datum.set_channels(1); + datum.clear_data(); + datum.clear_float_data(); + feature_blob_data = feature_blob->mutable_cpu_data() + + feature_blob->offset(n); + for (int d = 0; d < dim_features; ++d) { + datum.add_float_data(feature_blob_data[d]); + } + string value; + datum.SerializeToString(&value); + snprintf(key_str, kMaxKeyStrLength, "%d", image_index); + batch->Put(string(key_str), value); + ++image_index; + if (image_index % 1000 == 0) { + db->Write(leveldb::WriteOptions(), batch); + LOG(ERROR)<< "Extracted features of " << image_index << + " query images."; + delete batch; + batch = new leveldb::WriteBatch(); + } + } // for (int n = 0; n < num_features; ++n) + } // for (int batch_index = 0; batch_index < num_mini_batches; ++batch_index) + // write the last batch + if (image_index % 1000 != 0) { + db->Write(leveldb::WriteOptions(), batch); + LOG(ERROR)<< "Extracted features of " << image_index << + " query images."; + } + + delete batch; + delete db; + LOG(ERROR)<< "Successfully 
extracted the features!"; + return 0; +} + diff --git a/tools/net_speed_benchmark.cpp b/tools/net_speed_benchmark.cpp index 96d40a2eb37..43f7b493671 100644 --- a/tools/net_speed_benchmark.cpp +++ b/tools/net_speed_benchmark.cpp @@ -58,9 +58,11 @@ int main(int argc, char** argv) { LOG(ERROR) << "Performing Forward"; // Note that for the speed benchmark, we will assume that the network does // not take any input blobs. - caffe_net.Forward(vector<Blob<float>*>()); + float initial_loss; + caffe_net.Forward(vector<Blob<float>*>(), &initial_loss); + LOG(ERROR) << "Initial loss: " << initial_loss; LOG(ERROR) << "Performing Backward"; - LOG(ERROR) << "Initial loss: " << caffe_net.Backward(); + caffe_net.Backward(); const vector<shared_ptr<Layer<float> > >& layers = caffe_net.layers(); vector<vector<Blob<float>*> >& bottom_vecs = caffe_net.bottom_vecs();
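
The net.cpp and net_speed_benchmark.cpp hunks above move loss reporting from `Backward()` into `Forward()`, which now sums each layer's returned loss into an optional output pointer while `Backward()` only computes gradients. A minimal sketch of the new calling convention, assuming an already constructed `Net<float>` whose data layers feed themselves, so the bottom vector can stay empty:

    #include <vector>

    #include "caffe/blob.hpp"
    #include "caffe/common.hpp"
    #include "caffe/net.hpp"

    using namespace caffe;  // NOLINT(build/namespaces)

    // Runs one forward/backward pass and reports the loss that Forward()
    // accumulates across layers into the optional output argument.
    void run_one_pass(Net<float>* net) {
      float loss = 0;
      net->Forward(std::vector<Blob<float>*>(), &loss);
      LOG(INFO) << "Loss after forward pass: " << loss;
      net->Backward();  // gradients only; no loss value is returned any more
    }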
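
The new `has_blob`/`blob_by_name` (and `has_layer`/`layer_by_name`) accessors added to net.cpp let callers look up blobs and layers by the names declared in the prototxt, which is how extract_features.cpp reaches its feature blob. A small usage sketch; the blob name passed in is whatever your own net definition declares:

    #include <string>

    #include "caffe/blob.hpp"
    #include "caffe/common.hpp"
    #include "caffe/net.hpp"

    using namespace caffe;  // NOLINT(build/namespaces)

    // Reads out a named blob, guarding against unknown names.
    void inspect_blob(Net<float>* net, const std::string& blob_name) {
      if (!net->has_blob(blob_name)) {
        LOG(WARNING) << "Net has no blob named " << blob_name;
        return;
      }
      // blob_by_name returns an empty shared_ptr (and logs a warning) for
      // unknown names, so the has_blob guard above is optional.
      const shared_ptr<Blob<float> > blob = net->blob_by_name(blob_name);
      LOG(INFO) << blob_name << " has " << blob->count() << " elements, "
                << "first value = " << blob->cpu_data()[0];
    }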
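
For reference, the arithmetic the new L1 and L2 regularizers apply to a parameter blob, written as a standalone sketch with plain arrays standing in for `Blob` data and `coeff` standing in for `RegularizerParameter.coeff`:

    #include <cmath>
    #include <cstddef>

    // L1: adds coeff * sign(w_i) to the existing gradient and returns the
    // penalty coeff * sum_i |w_i|.
    double l1_regularize(std::size_t n, double coeff,
                         const double* w, double* grad) {
      double penalty = 0;
      for (std::size_t i = 0; i < n; ++i) {
        penalty += std::fabs(w[i]);
        grad[i] += coeff * ((w[i] > 0) - (w[i] < 0));  // sign(w_i)
      }
      return coeff * penalty;
    }

    // L2: adds 2 * coeff * w_i to the existing gradient and returns the
    // penalty coeff * sum_i w_i^2.
    double l2_regularize(std::size_t n, double coeff,
                         const double* w, double* grad) {
      double penalty = 0;
      for (std::size_t i = 0; i < n; ++i) {
        penalty += w[i] * w[i];
        grad[i] += 2 * coeff * w[i];
      }
      return coeff * penalty;
    }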
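
The boost-backed `caffe_vRngUniform`, `caffe_vRngGaussian`, and `caffe_vRngBernoulli` helpers replace the MKL VSL calls and respect `Caffe::set_random_seed`, as the new random-number-generator and common tests check. A short usage sketch, assuming the float and int instantiations shown in the math_functions.cpp hunk:

    #include <vector>

    #include "caffe/common.hpp"
    #include "caffe/util/math_functions.hpp"

    using namespace caffe;  // NOLINT(build/namespaces)

    // Draws samples from the three distributions; re-seeding with the same
    // value before each call reproduces the identical sequence.
    void draw_samples() {
      const int n = 10000;
      std::vector<float> uniform(n);
      std::vector<float> gaussian(n);
      std::vector<int> bernoulli(n);
      Caffe::set_random_seed(1701);
      caffe_vRngUniform<float>(n, &uniform[0], 0.0f, 1.0f);    // U(0, 1)
      caffe_vRngGaussian<float>(n, &gaussian[0], 0.0f, 1.0f);  // N(0, 1)
      caffe_vRngBernoulli<int>(n, &bernoulli[0], 0.5);         // Bernoulli(0.5)
    }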
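
The `hdf5_save_nd_dataset` specializations added to io.cpp write a 4-D blob into an open HDF5 file, which is what the new HDF5 output layer relies on. A hypothetical sketch, assuming the matching declaration lives in `caffe/util/io.hpp`; the file path and dataset name are illustrative only:

    #include <string>

    #include "hdf5.h"
    #include "hdf5_hl.h"

    #include "caffe/blob.hpp"
    #include "caffe/common.hpp"
    #include "caffe/util/io.hpp"

    using namespace caffe;  // NOLINT(build/namespaces)

    // Writes a 4-D blob to a freshly created HDF5 file.
    void save_blob_to_hdf5(const Blob<float>& blob) {
      const std::string file_name = "/tmp/example_blob.h5";  // hypothetical path
      hid_t file_id = H5Fcreate(file_name.c_str(), H5F_ACC_TRUNC,
                                H5P_DEFAULT, H5P_DEFAULT);
      CHECK_GE(file_id, 0) << "Failed to create HDF5 file " << file_name;
      hdf5_save_nd_dataset<float>(file_id, "data", blob);
      herr_t status = H5Fclose(file_id);
      CHECK_GE(status, 0) << "Failed to close HDF5 file " << file_name;
    }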