diff --git a/Makefile b/Makefile
index 77e2ff5edbe..41d594851f0 100644
--- a/Makefile
+++ b/Makefile
@@ -224,6 +224,14 @@ endif
 INCLUDE_DIRS += $(BLAS_INCLUDE)
 LIBRARY_DIRS += $(BLAS_LIB)
 
+OPENCL ?= 0
+ifeq ($(OPENCL), 1)
+	INCLUDE_DIRS += $(OPENCL_INCLUDE_DIR) $(CLBLAS_INCLUDE_DIR)
+	LIBRARY_DIRS += $(OPENCL_LIB_DIR) $(CLBLAS_LIB_DIR)
+	LIBRARIES += $(OPENCL_LIBS) $(CLBLAS_LIBS)
+	COMMON_FLAGS += -DUSE_OPENCL
+endif
+
 # Complete build flags.
 COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
 CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS)
diff --git a/Makefile.config.example b/Makefile.config.example
index 73c3740b1c7..4bb904c54b4 100644
--- a/Makefile.config.example
+++ b/Makefile.config.example
@@ -46,6 +46,13 @@ PYTHON_INCLUDE := /usr/local/include/python2.7 \
 PYTHON_LIB := /usr/local/lib
 # PYTHON_LIB := $(HOME)/anaconda/lib
 
+OPENCL_INCLUDE_DIR := /opt/AMDAPP/include/
+OPENCL_LIB_DIR := /opt/AMDAPP/lib/x86_64/
+OPENCL_LIBS := OpenCL
+CLBLAS_INCLUDE_DIR := /home/user/Codes/clBLAS/src/package/include
+CLBLAS_LIB_DIR := /home/user/Codes/clBLAS/src/package/lib64
+CLBLAS_LIBS := clBLAS
+
 # Whatever else you find you need goes here.
 INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include
 LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib
diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index c04375a10e2..30593388347 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -71,6 +71,12 @@ class Blob {
   Dtype* mutable_gpu_data();
   Dtype* mutable_cpu_diff();
   Dtype* mutable_gpu_diff();
+
+  const Dtype* const_data() const;
+  const Dtype* const_diff() const;
+  Dtype* mutable_data();
+  Dtype* mutable_diff();
+
   void Update();
   void FromProto(const BlobProto& proto);
   void ToProto(BlobProto* proto, bool write_diff = false) const;
diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index bd4e39f136d..e25ad38d022 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -74,7 +74,7 @@ class Caffe {
     }
     return *singleton_;
   }
-  enum Brew { CPU, GPU };
+  enum Brew { CPU, GPU, OPENCL_CPU, OPENCL_GPU, OPENCL_ALL };
   enum Phase { TRAIN, TEST };
diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp
index 2c6be551d8f..45112af72bc 100644
--- a/include/caffe/data_layers.hpp
+++ b/include/caffe/data_layers.hpp
@@ -31,6 +31,10 @@ class HDF5OutputLayer : public Layer<Dtype> {
   virtual ~HDF5OutputLayer();
   virtual void SetUp(const vector<Blob<Dtype>*>& bottom,
       vector<Blob<Dtype>*>* top) {}
+  virtual Dtype Forward(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+  virtual void Backward(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom) { return; }
 
   virtual inline LayerParameter_LayerType type() const {
     return LayerParameter_LayerType_HDF5_OUTPUT;
@@ -42,14 +46,6 @@ class HDF5OutputLayer : public Layer<Dtype> {
   inline std::string file_name() const { return file_name_; }
 
  protected:
-  virtual Dtype Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      vector<Blob<Dtype>*>* top);
-  virtual Dtype Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      vector<Blob<Dtype>*>* top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
   virtual void SaveBlobs();
 
   std::string file_name_;
@@ -67,6 +63,10 @@ class HDF5DataLayer : public Layer<Dtype> {
   virtual ~HDF5DataLayer();
   virtual void SetUp(const vector<Blob<Dtype>*>& bottom,
       vector<Blob<Dtype>*>* top);
+  virtual Dtype Forward(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+  virtual void Backward(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, vector<Blob<Dtype>*>*
bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_HDF5_DATA; @@ -75,14 +75,6 @@ class HDF5DataLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 2; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} virtual void LoadHDF5FileData(const char* filename); std::vector hdf_filenames_; @@ -111,6 +103,10 @@ class DataLayer : public Layer { virtual ~DataLayer(); virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_DATA; @@ -120,15 +116,6 @@ class DataLayer : public Layer { virtual inline int MaxTopBlobs() const { return 2; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void CreatePrefetchThread(); virtual void JoinPrefetchThread(); virtual unsigned int PrefetchRand(); @@ -170,15 +157,12 @@ class DummyDataLayer : public Layer { } virtual inline int ExactNumBottomBlobs() const { return 0; } virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual Dtype Forward_cpu(const vector*>& bottom, + virtual Dtype Forward(const vector*>& bottom, vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, + virtual void Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) {} + protected: vector > > fillers_; vector refill_; }; @@ -198,6 +182,10 @@ class ImageDataLayer : public Layer { virtual ~ImageDataLayer(); virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_IMAGE_DATA; @@ -206,15 +194,6 @@ class ImageDataLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 2; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void ShuffleImages(); virtual void CreatePrefetchThread(); @@ -244,6 +223,10 @@ class MemoryDataLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) {} virtual inline LayerParameter_LayerType type() 
const { return LayerParameter_LayerType_MEMORY_DATA; @@ -260,13 +243,6 @@ class MemoryDataLayer : public Layer { int batch_size() { return batch_size_; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - Dtype* data_; Dtype* labels_; int datum_channels_; @@ -293,6 +269,10 @@ class WindowDataLayer : public Layer { virtual ~WindowDataLayer(); virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_WINDOW_DATA; @@ -301,15 +281,6 @@ class WindowDataLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 2; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void CreatePrefetchThread(); virtual void JoinPrefetchThread(); virtual unsigned int PrefetchRand(); diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index 242f11a3513..acf34270f6a 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -34,7 +34,7 @@ class ConstantFiller : public Filler { explicit ConstantFiller(const FillerParameter& param) : Filler(param) {} virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); + Dtype* data = blob->mutable_data(); const int count = blob->count(); const Dtype value = this->filler_param_.value(); CHECK(count); @@ -54,7 +54,7 @@ class UniformFiller : public Filler { virtual void Fill(Blob* blob) { CHECK(blob->count()); caffe_rng_uniform(blob->count(), Dtype(this->filler_param_.min()), - Dtype(this->filler_param_.max()), blob->mutable_cpu_data()); + Dtype(this->filler_param_.max()), blob->mutable_data()); CHECK_EQ(this->filler_param_.sparse(), -1) << "Sparsity not supported by this Filler."; } @@ -66,10 +66,10 @@ class GaussianFiller : public Filler { explicit GaussianFiller(const FillerParameter& param) : Filler(param) {} virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); + Dtype* data = blob->mutable_data(); CHECK(blob->count()); caffe_rng_gaussian(blob->count(), Dtype(this->filler_param_.mean()), - Dtype(this->filler_param_.std()), blob->mutable_cpu_data()); + Dtype(this->filler_param_.std()), blob->mutable_data()); int sparse = this->filler_param_.sparse(); CHECK_GE(sparse, -1); if (sparse >= 0) { @@ -82,7 +82,7 @@ class GaussianFiller : public Filler { int num_inputs = blob->height(); Dtype non_zero_probability = Dtype(sparse) / Dtype(num_inputs); rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int))); - int* mask = reinterpret_cast(rand_vec_->mutable_cpu_data()); + int* mask = reinterpret_cast(rand_vec_->mutable_data()); caffe_rng_bernoulli(blob->count(), non_zero_probability, mask); for (int i = 0; i < blob->count(); ++i) { data[i] *= mask[i]; @@ -100,9 +100,9 @@ class PositiveUnitballFiller : public Filler { explicit PositiveUnitballFiller(const FillerParameter& param) : Filler(param) {} virtual 
void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); + Dtype* data = blob->mutable_data(); DCHECK(blob->count()); - caffe_rng_uniform(blob->count(), 0, 1, blob->mutable_cpu_data()); + caffe_rng_uniform(blob->count(), 0, 1, blob->mutable_data()); // We expect the filler to not be called very frequently, so we will // just use a simple implementation int dim = blob->count() / blob->num(); @@ -139,7 +139,7 @@ class XavierFiller : public Filler { int fan_in = blob->count() / blob->num(); Dtype scale = sqrt(Dtype(3) / fan_in); caffe_rng_uniform(blob->count(), -scale, scale, - blob->mutable_cpu_data()); + blob->mutable_data()); CHECK_EQ(this->filler_param_.sparse(), -1) << "Sparsity not supported by this Filler."; } diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index 690c36ba23f..47dddb6ad22 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -9,6 +9,7 @@ #include "caffe/blob.hpp" #include "caffe/common.hpp" #include "caffe/proto/caffe.pb.h" +#include "caffe/util/device.hpp" using std::string; using std::vector; @@ -43,9 +44,9 @@ class Layer { // Forward and backward wrappers. You should implement the cpu and // gpu specific implementations instead, and should not change these // functions. - inline Dtype Forward(const vector*>& bottom, + virtual Dtype Forward(const vector*>& bottom, vector*>* top); - inline void Backward(const vector*>& top, + virtual void Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom); @@ -101,7 +102,7 @@ class Layer { // Forward functions: compute the layer output // (and loss layers return the loss; other layers return the dummy value 0.) virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top) = 0; + vector*>* top) { return static_cast(0); } // If no gpu code is provided, we will simply use cpu code. virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top) { @@ -113,7 +114,7 @@ class Layer { // for the bottom blobs if propagate_down is true. virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, - vector*>* bottom) = 0; + vector*>* bottom) { return; } virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { @@ -165,7 +166,7 @@ class Layer { // gpu specific implementations instead, and should not change these // functions. 
template -inline Dtype Layer::Forward(const vector*>& bottom, +Dtype Layer::Forward(const vector*>& bottom, vector*>* top) { switch (Caffe::mode()) { case Caffe::CPU: @@ -179,7 +180,7 @@ inline Dtype Layer::Forward(const vector*>& bottom, } template -inline void Layer::Backward(const vector*>& top, +void Layer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { switch (Caffe::mode()) { diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 3a4d41662fd..ab8a9caa5f0 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -63,6 +63,10 @@ class SoftmaxWithLossLayer : public Layer { : Layer(param), softmax_layer_(new SoftmaxLayer(param)) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_SOFTMAX_LOSS; @@ -75,15 +79,6 @@ class SoftmaxWithLossLayer : public Layer { } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - shared_ptr > softmax_layer_; // prob stores the output probability of the layer. Blob prob_; @@ -103,21 +98,16 @@ class SigmoidCrossEntropyLossLayer : public LossLayer { sigmoid_output_(new Blob()) {} virtual void FurtherSetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_SIGMOID_CROSS_ENTROPY_LOSS; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - shared_ptr > sigmoid_layer_; // sigmoid_output stores the output of the sigmoid layer. 
shared_ptr > sigmoid_output_; @@ -139,6 +129,10 @@ class EuclideanLossLayer : public LossLayer { : LossLayer(param), diff_() {} virtual void FurtherSetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_EUCLIDEAN_LOSS; @@ -150,15 +144,6 @@ class EuclideanLossLayer : public LossLayer { } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - Blob diff_; }; @@ -171,17 +156,16 @@ class InfogainLossLayer : public LossLayer { : LossLayer(param), infogain_() {} virtual void FurtherSetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_INFOGAIN_LOSS; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - Blob infogain_; }; @@ -196,11 +180,9 @@ class HingeLossLayer : public LossLayer { virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_HINGE_LOSS; } - - protected: - virtual Dtype Forward_cpu(const vector*>& bottom, + virtual Dtype Forward(const vector*>& bottom, vector*>* top); - virtual void Backward_cpu(const vector*>& top, + virtual void Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom); }; @@ -213,16 +195,14 @@ class MultinomialLogisticLossLayer : public LossLayer { : LossLayer(param) {} virtual void FurtherSetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS; } - - protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); }; /* AccuracyLayer @@ -236,6 +216,12 @@ class AccuracyLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) { + NOT_IMPLEMENTED; + } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_ACCURACY; @@ -245,13 +231,6 @@ class AccuracyLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - NOT_IMPLEMENTED; - } - int top_k_; }; diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index e52e395e24b..16c8ae64835 100644 --- a/include/caffe/neuron_layers.hpp +++ 
b/include/caffe/neuron_layers.hpp @@ -120,21 +120,16 @@ class PowerLayer : public NeuronLayer { : NeuronLayer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_POWER; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - Dtype power_; Dtype scale_; Dtype shift_; diff --git a/include/caffe/opencl_syncedmem.hpp b/include/caffe/opencl_syncedmem.hpp new file mode 100644 index 00000000000..22d045ce3d4 --- /dev/null +++ b/include/caffe/opencl_syncedmem.hpp @@ -0,0 +1,77 @@ +// Copyright 2014 BVLC and contributors. + +#ifdef USE_OPENCL +#ifndef CAFFE_OPENCL_SYNCEDMEM_HPP_ +#define CAFFE_OPENCL_SYNCEDMEM_HPP_ + +#include +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include "caffe/common.hpp" +#include "caffe/syncedmem.hpp" +#include "caffe/util/opencl_device.hpp" + +namespace caffe { + + +/* + * https://software.intel.com/sites/products/documentation/ioclsdk/2013/OG/Mapping_Memory_Objects_(USE_HOST_PTR).htm + * For efficiency reasons such a host-side pointer must be allocated for the + * conditions: + * * The amount of memory you allocate and the size of the corresponding + * * OpenCL* buffer must be multiple of the cache line sizes (64 bytes). + * * Always use 4k alignment (page alignment) when you allocate the host memory + * * for sharing with OpenCL devices. 
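+ *
+ * Usage sketch (editorial illustration only; `context` stands for a valid
+ * cl_context and `count` for the number of elements to allocate):
+ *
+ *   size_t bytes = count * sizeof(float);
+ *   void* host_ptr = NULL;
+ *   opencl_aligned_malloc(&host_ptr, &bytes);  // page-aligned, padded size
+ *   cl_int error;
+ *   cl_mem buf = clCreateBuffer(context,
+ *       CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, bytes, host_ptr, &error);
+ *   // ... map/unmap buf to touch the data on the host, enqueue kernels ...
+ *   clReleaseMemObject(buf);
+ *   opencl_aligned_free(host_ptr);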
+ */ +#define OPENCL_CACHE_LINE_SIZE 64 +#define OPENCL_PAGE_ALIGNMENT 4096 + +inline void opencl_aligned_malloc(void** ptr, size_t* size) { + *size += (*size % OPENCL_CACHE_LINE_SIZE); +#ifdef _MSC_VER + *ptr = _aligned_malloc(*size, OPENCL_PAGE_ALIGNMENT); +#else + if(posix_memalign(ptr, OPENCL_PAGE_ALIGNMENT, *size)) { + *ptr = NULL; + } +#endif +} + +inline void opencl_aligned_free(void* ptr) { +#ifdef _MSC_VER + _aligned_free(ptr); +#else + free(ptr); +#endif +} + +class OpenCLSyncedMemory : public AbstractSyncedMemory { + public: + OpenCLSyncedMemory() : AbstractSyncedMemory() {} + explicit OpenCLSyncedMemory(size_t size) : AbstractSyncedMemory(size) {} + ~OpenCLSyncedMemory(); + const void* cpu_data(); + void set_cpu_data(void* data); + const void* gpu_data(); + void* mutable_cpu_data(); + void* mutable_gpu_data(); + protected: + void to_cpu(); + void to_gpu(); + + private: + void* shared_host_ptr_; + void* mapped_device_ptr_; + cl_mem device_mem_; + + DISABLE_COPY_AND_ASSIGN(OpenCLSyncedMemory); +}; // class OpenCLSyncedMemory + +} // namespace caffe + +#endif // CAFFE_OPENCL_SYNCEDMEM_HPP_ +#endif // USE_OPENCL diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index bed55c3806e..30e7d771ad4 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -31,33 +31,56 @@ inline void CaffeFreeHost(void* ptr) { free(ptr); } - -class SyncedMemory { +class AbstractSyncedMemory { public: - SyncedMemory() + AbstractSyncedMemory() : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), own_cpu_data_(false) {} - explicit SyncedMemory(size_t size) + explicit AbstractSyncedMemory(size_t size) : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), own_cpu_data_(false) {} - ~SyncedMemory(); + virtual ~AbstractSyncedMemory() {} + enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; + virtual const void* cpu_data() = 0; + virtual void set_cpu_data(void* data) = 0; + virtual const void* gpu_data() = 0; + virtual void* mutable_cpu_data() = 0; + virtual void* mutable_gpu_data() = 0; + virtual SyncedHead head() { return head_; } + virtual size_t size() { return size_; } + + virtual const void* const_data() { return NULL; } + virtual void* mutable_data() { return NULL;} + + protected: + virtual void to_cpu() = 0; + virtual void to_gpu() = 0; + + protected: + void* cpu_ptr_; + void* gpu_ptr_; + size_t size_; + SyncedHead head_; + bool own_cpu_data_; +}; + +class SyncedMemory : public AbstractSyncedMemory { + public: + SyncedMemory() : AbstractSyncedMemory() {} + explicit SyncedMemory(size_t size) : AbstractSyncedMemory(size) {} + virtual ~SyncedMemory(); const void* cpu_data(); void set_cpu_data(void* data); const void* gpu_data(); void* mutable_cpu_data(); void* mutable_gpu_data(); - enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; - SyncedHead head() { return head_; } - size_t size() { return size_; } - private: + const void* const_data(); + void* mutable_data(); + + protected: void to_cpu(); void to_gpu(); - void* cpu_ptr_; - void* gpu_ptr_; - size_t size_; - SyncedHead head_; - bool own_cpu_data_; DISABLE_COPY_AND_ASSIGN(SyncedMemory); }; // class SyncedMemory diff --git a/include/caffe/syncedmem_factory.hpp b/include/caffe/syncedmem_factory.hpp new file mode 100644 index 00000000000..08ff2f0da09 --- /dev/null +++ b/include/caffe/syncedmem_factory.hpp @@ -0,0 +1,17 @@ +// Copyright 2014 BVLC and contributors. 
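
The syncedmem.hpp refactor above turns SyncedMemory into one concrete implementation of AbstractSyncedMemory, and the new syncedmem_factory.hpp header that follows declares a GetSyncedMemory() factory so callers no longer construct a concrete class directly. The factory's definition is not part of this section; a minimal sketch of what it could look like, assuming it dispatches on Caffe::mode() (everything beyond the names declared in these headers is illustrative):

// Hypothetical definition, e.g. in a src/caffe/ source file not shown in this diff.
namespace caffe {
AbstractSyncedMemory* GetSyncedMemory(const size_t size) {
#ifdef USE_OPENCL
  switch (Caffe::mode()) {
    case Caffe::OPENCL_CPU:
    case Caffe::OPENCL_GPU:
    case Caffe::OPENCL_ALL:
      // OpenCL modes get the mapped host-memory implementation.
      return new OpenCLSyncedMemory(size);
    default:
      break;
  }
#endif
  // CPU and CUDA GPU modes keep the existing behaviour.
  return new SyncedMemory(size);
}
}  // namespace caffe
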
+ +#ifndef CAFFE_SYNCEDMEM_FACTORY_HPP_ +#define CAFFE_SYNCEDMEM_FACTORY_HPP_ + +#include "caffe/common.hpp" +#include "caffe/syncedmem.hpp" +#include "caffe/opencl_syncedmem.hpp" + +namespace caffe { + +// The SyncedMemory factory function +AbstractSyncedMemory* GetSyncedMemory(const size_t size = 0); + +} // namespace caffe + +#endif // CAFFE_SYNCEDMEM_FACTORY_HPP_ diff --git a/include/caffe/util/device.hpp b/include/caffe/util/device.hpp new file mode 100644 index 00000000000..ef3e9d5cdbd --- /dev/null +++ b/include/caffe/util/device.hpp @@ -0,0 +1,269 @@ +// Copyright 2014 BVLC and contributors. + +#ifndef CAFFE_UTIL_DEVICE_H_ +#define CAFFE_UTIL_DEVICE_H_ + +#include +#include + +#include "glog/logging.h" + +#include "caffe/util/im2col.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +class Device { + public: + virtual ~Device() { + } + virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C) { NOT_IMPLEMENTED; } + + virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, + const Dtype beta, Dtype* y) { NOT_IMPLEMENTED; } + + virtual void axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y) { NOT_IMPLEMENTED; } + + virtual void axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y) { NOT_IMPLEMENTED; } + + virtual void copy(const int N, const Dtype *X, Dtype *Y) { NOT_IMPLEMENTED; } + virtual void copy_from_cpu(const int N, const Dtype* X, Dtype* Y) { + NOT_IMPLEMENTED; } + + virtual void set(const int N, const Dtype alpha, Dtype *X) { + NOT_IMPLEMENTED; } + + virtual void add_scalar(const int N, const Dtype alpha, Dtype *X) { + NOT_IMPLEMENTED; } + + virtual void scal(const int N, const Dtype alpha, Dtype *X) { + NOT_IMPLEMENTED; } + + virtual void sqr(const int N, const Dtype* a, Dtype* y) { NOT_IMPLEMENTED; } + + virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y) { + NOT_IMPLEMENTED; } + + virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y) { + NOT_IMPLEMENTED; } + + virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y) { + NOT_IMPLEMENTED; } + + virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y) { + NOT_IMPLEMENTED; } + + virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y) { + NOT_IMPLEMENTED; } + + virtual void rng_uniform(const int N, const Dtype a, const Dtype b, + Dtype* r) { NOT_IMPLEMENTED; } + + virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, + Dtype* r) { NOT_IMPLEMENTED; } + + virtual void rng_bernoulli(const int N, const Dtype p, int* r) { + NOT_IMPLEMENTED; } + + virtual void exp(const int N, const Dtype* a, Dtype* y) { NOT_IMPLEMENTED; } + + virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out) { + NOT_IMPLEMENTED; } + + virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, + uint32_t* out) { NOT_IMPLEMENTED; } + +// Returns the sum of the absolute values of the elements of vector x + virtual void asum(const int N, const Dtype* x, Dtype* y) { NOT_IMPLEMENTED; } + + virtual void sign(const int N, const Dtype* x, Dtype* y) { NOT_IMPLEMENTED; } + + virtual void sgnbit(const int N, const Dtype* x, Dtype* y) { + NOT_IMPLEMENTED; } + + virtual void fabs(const int N, const Dtype* x, Dtype* y) { NOT_IMPLEMENTED; } + + virtual void 
scale(const int N, const Dtype alpha, const Dtype *x, + Dtype* y) { NOT_IMPLEMENTED; } + + virtual void im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col) { NOT_IMPLEMENTED; } + + virtual void col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im) { NOT_IMPLEMENTED; } +}; + +template +class CPUDevice : public Device { + public: + CPUDevice() { + } + virtual ~CPUDevice() { + } + virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, const Dtype beta, Dtype* C); + + virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, + const Dtype beta, Dtype* y); + + virtual void axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); + + virtual void axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + + virtual void copy(const int N, const Dtype *X, Dtype *Y); + virtual void copy_from_cpu(const int N, const Dtype* X, Dtype* Y); + + virtual void set(const int N, const Dtype alpha, Dtype *X); + + virtual void add_scalar(const int N, const Dtype alpha, Dtype *X); + + virtual void scal(const int N, const Dtype alpha, Dtype *X); + + virtual void sqr(const int N, const Dtype* a, Dtype* y); + + virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y); + + virtual void rng_uniform(const int N, const Dtype a, const Dtype b, Dtype* r); + + virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, + Dtype* r); + + virtual void rng_bernoulli(const int N, const Dtype p, int* r); + + virtual void exp(const int N, const Dtype* a, Dtype* y); + + virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); + + virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, + uint32_t* out); + +// Returns the sum of the absolute values of the elements of vector x + virtual void asum(const int N, const Dtype* x, Dtype* y); + + virtual void sign(const int N, const Dtype* x, Dtype* y); + + virtual void sgnbit(const int N, const Dtype* x, Dtype* y); + + virtual void fabs(const int N, const Dtype* x, Dtype* y); + + virtual void scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y); + + virtual void im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col); + + virtual void col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im); +}; + +template +class GPUDevice : public Device { + public: + GPUDevice() { + } + virtual ~GPUDevice() { + } + virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, const Dtype beta, Dtype* C); + + virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const 
Dtype* A, const Dtype* x, + const Dtype beta, Dtype* y); + + virtual void axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); + + virtual void axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + + virtual void copy(const int N, const Dtype *X, Dtype *Y); + virtual void copy_from_cpu(const int N, const Dtype* X, Dtype* Y); + + virtual void set(const int N, const Dtype alpha, Dtype *X); + + virtual void add_scalar(const int N, const Dtype alpha, Dtype *X); + + virtual void scal(const int N, const Dtype alpha, Dtype *X); + + virtual void sqr(const int N, const Dtype* a, Dtype* y); + + virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y); + + virtual void rng_uniform(const int N, const Dtype a, const Dtype b, Dtype* r); + + virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, + Dtype* r); + + virtual void rng_bernoulli(const int N, const Dtype p, int* r); + + virtual void exp(const int N, const Dtype* a, Dtype* y); + + virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); + + virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, + uint32_t* out); + +// Returns the sum of the absolute values of the elements of vector x + virtual void asum(const int N, const Dtype* x, Dtype* y); + + virtual void sign(const int N, const Dtype* x, Dtype* y); + + virtual void sgnbit(const int N, const Dtype* x, Dtype* y); + + virtual void fabs(const int N, const Dtype* x, Dtype* y); + + virtual void scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y); + + virtual void im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col); + + virtual void col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im); +}; + +template +class DeviceFactory { + public: + static Device* GetDevice(); + private: + static Device* cpu_device_; + static Device* gpu_device_; +#ifdef USE_OPENCL + static Device* opencl_device_; +#endif +}; + +} // namespace caffe + +#endif // CAFFE_UTIL_DEVICE_H_ diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index a649d8cc4e8..809308caf80 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -3,6 +3,8 @@ #ifndef _CAFFE_UTIL_IM2COL_HPP_ #define _CAFFE_UTIL_IM2COL_HPP_ +#include "caffe/common.hpp" + namespace caffe { template diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp new file mode 100644 index 00000000000..d976d580916 --- /dev/null +++ b/include/caffe/util/opencl_device.hpp @@ -0,0 +1,178 @@ +// Copyright 2014 BVLC and contributors. 
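
The Device<Dtype> hierarchy introduced in device.hpp above wraps the caffe_cpu_* / caffe_gpu_* math and im2col routines behind one virtual interface, and DeviceFactory<Dtype>::GetDevice() is meant to hand back the implementation matching the current Caffe::mode(). A before/after sketch of the call pattern inside a layer (illustrative fragment, not a complete function; bottom_data, weight and top_data are the usual layer-local pointers):

// Before: the backend is hard-coded in Forward_cpu() / Forward_gpu().
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M, N, K,
    (Dtype)1., bottom_data, weight, (Dtype)0., top_data);

// After: a single Forward() asks the factory for the active device.
Device<Dtype>* device = DeviceFactory<Dtype>::GetDevice();
device->gemm(CblasNoTrans, CblasNoTrans, M, N, K,
    (Dtype)1., bottom_data, weight, (Dtype)0., top_data);
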
+ +#ifdef USE_OPENCL +#ifndef CAFFE_UTIL_OPENCL_DEVICE_H_ +#define CAFFE_UTIL_OPENCL_DEVICE_H_ + +#ifdef __APPLE__ +#include +#else +#include +#endif +#include "clBLAS.h" + +#include "glog/logging.h" + +#include "caffe/util/device.hpp" + +#include + +namespace caffe { + +#define CL_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cl_int error = condition; \ + CHECK_EQ(error, CL_SUCCESS) << " " << clGetErrorString(error); \ + } while (0) + +#define CLBLAS_CHECK(condition) \ + do { \ + clblasStatus status = condition; \ + CHECK_EQ(status, clblasSuccess) << " " \ + << caffe::clblasGetErrorString(status); \ + } while (0) + +const char* clGetErrorString(cl_int error); +const char* clblasGetErrorString(clblasStatus status); + +class CaffeOpenCL { + public: + inline static CaffeOpenCL& Get() { + if (!singleton_.get()) { + singleton_.reset(new CaffeOpenCL()); + } + return *singleton_; + } + + virtual ~CaffeOpenCL() { + } + + void SetDevice(const int device_id); + inline static cl_context context() { + if (Get().cl_context_ == NULL) { + Get().create_context(); + } + return Get().cl_context_; + } + inline static cl_command_queue queue() { + if (Get().cl_command_queue_ == NULL) { + Get().create_queue(); + } + return Get().cl_command_queue_; + } + protected: + cl_device_type get_device_type(); + cl_device_id current_cl_device_id(); + void create_context(); + void release_context(); + void create_queue(); + void release_queue(); + void initialize_clblas(); + void finalize_clblas(); + protected: + static shared_ptr singleton_; + + int current_device_id_; + cl_platform_id current_cl_platform_id_; + cl_int current_platform_device_count_; + std::vector current_platform_device_ids_; + int current_platform_device_id_; + cl_context cl_context_; + cl_command_queue cl_command_queue_; + bool clblas_initialized_; + private: + CaffeOpenCL() : + current_device_id_(0), current_cl_platform_id_(NULL), + current_platform_device_count_(0), current_platform_device_id_(0), + cl_context_(NULL), cl_command_queue_(NULL), clblas_initialized_(false) { + initialize_clblas(); + } + + DISABLE_COPY_AND_ASSIGN(CaffeOpenCL); +}; + + +template +class OpenCLDevice : public Device { + public: + OpenCLDevice() : Device() { + } + + virtual ~OpenCLDevice() { + } + + virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, const Dtype beta, Dtype* C); + + virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, + const Dtype beta, Dtype* y); + + virtual void axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); + + virtual void axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + + virtual void copy(const int N, const Dtype *X, Dtype *Y); + virtual void copy_from_cpu(const int N, const Dtype* X, Dtype* Y); + + virtual void set(const int N, const Dtype alpha, Dtype *X); + +// virtual void add_scalar(const int N, const Dtype alpha, Dtype *X); + + virtual void scal(const int N, const Dtype alpha, Dtype *X); + +// virtual void sqr(const int N, const Dtype* a, Dtype* y); +// +// virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y); +// +// virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); +// +// virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); +// +// virtual void div(const int N, const Dtype* a, const Dtype* b, 
Dtype* y); + +// virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y); + +// virtual void rng_uniform(const int N, const Dtype a, const Dtype b, Dtype* r); +// +// virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, +// Dtype* r); +// +// virtual void rng_bernoulli(const int N, const Dtype p, int* r); + +// virtual void exp(const int N, const Dtype* a, Dtype* y); + +// virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); +// +// virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, +// uint32_t* out); + +// Returns the sum of the absolute values of the elements of vector x +// virtual void asum(const int N, const Dtype* x, Dtype* y); + +// virtual void sign(const int N, const Dtype* x, Dtype* y); + +// virtual void sgnbit(const int N, const Dtype* x, Dtype* y); + +// virtual void fabs(const int N, const Dtype* x, Dtype* y); + +// virtual void scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y); + +// virtual void im2col(const Dtype* data_im, const int channels, +// const int height, const int width, const int ksize, const int pad, +// const int stride, Dtype* data_col); +// +// virtual void col2im(const Dtype* data_col, const int channels, +// const int height, const int width, const int psize, const int pad, +// const int stride, Dtype* data_im); +}; + + +} // namespace caffe + +#endif // CAFFE_UTIL_OPENCL_DEVICE_H_ +#endif // USE_OPENCL + diff --git a/include/caffe/util/opencl_math_functions.hpp b/include/caffe/util/opencl_math_functions.hpp new file mode 100644 index 00000000000..1927c58cc80 --- /dev/null +++ b/include/caffe/util/opencl_math_functions.hpp @@ -0,0 +1,291 @@ +// Copyright 2014 BVLC and contributors. + +#ifdef USE_OPENCL +#ifndef CAFFE_UTIL_OPENCL_MATH_FUNCTIONS_H_ +#define CAFFE_UTIL_OPENCL_MATH_FUNCTIONS_H_ + +#include "caffe/util/opencl_device.hpp" + +namespace caffe { + +#define CREATE_CL_MEM(A, M, K, FLAG) \ + cl_mem buf##A; \ + do { \ + cl_int error; \ + buf##A = clCreateBuffer( \ + CaffeOpenCL::context(), CL_MEM_##FLAG, M * K * sizeof(*A), \ + NULL, &error); \ + CL_CHECK(error); \ + } while(0) + +#define RELEASE_CL_MEM(A) clReleaseMemObject(buf##A) + +#define ENQUEUE_CL_BUFFER(FLAG, A, M, K) \ + CL_CHECK(clEnqueue##FLAG##Buffer( \ + CaffeOpenCL::queue(), buf##A, CL_TRUE, 0, M * K * sizeof(*A), \ + A, 0, NULL, NULL)); + +#define PRE_CLBLAS_CALL \ + cl_uint num_command_queues = 1; \ + cl_uint num_events_in_wait_list = 0; \ + cl_event *event_wait_list = NULL; \ + cl_event events = NULL; \ + cl_command_queue queue = CaffeOpenCL::queue(); + +#define ARRAY(A) buf##A, 0, ld##A + +#define CLBLAS_TRAILING_ARGS \ + num_command_queues, &queue, num_events_in_wait_list, \ + event_wait_list, &events + +#define OPENCL_UNARY_KERNEL(Dtype_str, name_str, operation_str) \ +"template \n" \ +"__kernel void " name_str "( \n" \ +" __global " Dtype_str "* x, \n" \ +" __global " Dtype_str "* y, \n" \ +" const unsigned int count) { \n" \ +" for (int i = get_global_id(0); \n" \ +" i < (count); \n" \ +" i += get_global_size(0)) { \n" \ +" " operation_str "; \n" \ +"} \n" \ +"\n"; + +#define OPENCL_BINARY_KERNEL(Dtype_str, name_str, operation_str) \ +"__kernel void " name_str "( \n" \ +" __global " Dtype_str "* a, \n" \ +" __global " Dtype_str "* b, \n" \ +" __global " Dtype_str "* y, \n" \ +" const unsigned int count) { \n" \ +" for (int i = get_global_id(0); \n" \ +" i < (count); \n" \ +" i += get_global_size(0)) { \n" \ +" " operation_str "; \n" \ +"} \n" \ +"\n" + +// local_size: Number of work 
items in each local work group +// global_size: Number of total work items +#define DEFINE_LOCAL_AND_GLOBAL_SIZE(n) \ + const size_t local_size = 64; \ + const size_t global_size = (n + local_size - 1) \ + / local_size + + +// https://www.olcf.ornl.gov/tutorials/opencl-vector-addition/ +#define DEFINE_OPENCL_UNARY_FUNC(Dtype, name, operation) \ +template <> \ +void caffe_opencl_##name(const int n, const Dtype *x, Dtype *y) { \ + const char* kernel_source = OPENCL_UNARY_KERNEL(#Dtype, #name, \ + #operation); \ + cl_context context = CaffeOpenCL::context(); \ + cl_command_queue queue = CaffeOpenCL::queue(); \ + cl_int error; \ + const size_t bytes = n * sizeof(Dtype); \ + cl_program program = clCreateProgramWithSource( \ + context, 1, (const char **) & kernel_source, NULL, &error); \ + CL_CHECK(error); \ + clBuildProgram(program, 0, NULL, NULL, NULL, NULL); \ + cl_kernel kernel = clCreateKernel(program, #name, &error); \ + CL_CHECK(error); \ + cl_mem d_x = clCreateBuffer(context, \ + CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, \ + bytes, \ + const_cast(static_cast(x)), \ + &error); \ + cl_mem d_y = clCreateBuffer(context, \ + CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, \ + bytes, static_cast(y), &error); \ + void* mapped_x = clEnqueueMapBuffer( \ + queue, d_x, CL_TRUE, CL_MAP_READ, 0, bytes, 0, NULL, NULL, &error); \ + CL_CHECK(error); \ + CL_CHECK(clEnqueueUnmapMemObject( \ + CaffeOpenCL::queue(), d_x, mapped_x, \ + 0, NULL, NULL)); \ + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_x)); \ + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_y)); \ + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(unsigned int), &n)); \ + DEFINE_LOCAL_AND_GLOBAL_SIZE(n); \ + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, \ + &local_size, 0, NULL, NULL)); \ + CL_CHECK(clFinish(queue)); \ + CL_CHECK(clEnqueueReadBuffer(queue, d_y, CL_TRUE, 0, \ + bytes, y, 0, NULL, NULL )); \ + void* mapped_y = clEnqueueMapBuffer( \ + queue, d_y, CL_TRUE, CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &error); \ + CL_CHECK(error); \ + CL_CHECK(clEnqueueUnmapMemObject( \ + CaffeOpenCL::queue(), d_y, mapped_y, \ + 0, NULL, NULL)); \ + CL_CHECK(clReleaseMemObject(d_x)); \ + CL_CHECK(clReleaseMemObject(d_y)); \ + CL_CHECK(clReleaseProgram(program)); \ + CL_CHECK(clReleaseKernel(kernel)); \ +} + +#define DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(name, operation) \ + DEFINE_OPENCL_UNARY_FUNC(float, name, operation) \ + DEFINE_OPENCL_UNARY_FUNC(double, name, operation) + +#define DEFINE_OPENCL_BINARY_FUNC(Dtype, name, operation) \ +template <> \ +void caffe_opencl_##name(const int n, const Dtype *a, const Dtype *b, \ + Dtype *y) { \ + const char* kernel_source = OPENCL_BINARY_KERNEL(#Dtype, #name, \ + #operation); \ + cl_context context = CaffeOpenCL::context(); \ + cl_command_queue queue = CaffeOpenCL::queue(); \ + cl_int error; \ + const size_t bytes = n * sizeof(Dtype); \ + cl_program program = clCreateProgramWithSource( \ + context, 1, (const char **) & kernel_source, NULL, &error); \ + CL_CHECK(error); \ + clBuildProgram(program, 0, NULL, NULL, NULL, NULL); \ + cl_kernel kernel = clCreateKernel(program, #name, &error); \ + CL_CHECK(error); \ + cl_mem d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, \ + const_cast(static_cast(a)),\ + &error); \ + cl_mem d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, \ + const_cast(static_cast(b)), \ + &error); \ + cl_mem d_y = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, \ + static_cast(y), &error); \ + void* mapped_a = clEnqueueMapBuffer( \ + queue, d_a, CL_TRUE, 
CL_MAP_READ, 0, bytes, 0, NULL, NULL, &error); \ + CL_CHECK(error); \ + CL_CHECK(clEnqueueUnmapMemObject( \ + CaffeOpenCL::queue(), d_a, mapped_a, \ + 0, NULL, NULL)); \ + void* mapped_b = clEnqueueMapBuffer( \ + queue, d_b, CL_TRUE, CL_MAP_READ, 0, bytes, 0, NULL, NULL, &error); \ + CL_CHECK(error); \ + CL_CHECK(clEnqueueUnmapMemObject( \ + CaffeOpenCL::queue(), d_b, mapped_b, \ + 0, NULL, NULL)); \ + CL_CHECK(clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, \ + bytes, a, 0, NULL, NULL)); \ + CL_CHECK(clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, \ + bytes, b, 0, NULL, NULL)); \ + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a)); \ + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b)); \ + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_y)); \ + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(unsigned int), &n)); \ + DEFINE_LOCAL_AND_GLOBAL_SIZE(n); \ + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, \ + &local_size, 0, NULL, NULL)); \ + CL_CHECK(clFinish(queue)); \ + void* mapped_y = clEnqueueMapBuffer( \ + queue, d_y, CL_TRUE, CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &error); \ + CL_CHECK(error); \ + CL_CHECK(clEnqueueUnmapMemObject( \ + CaffeOpenCL::queue(), d_y, mapped_y, \ + 0, NULL, NULL)); \ + CL_CHECK(clReleaseMemObject(d_a)); \ + CL_CHECK(clReleaseMemObject(d_b)); \ + CL_CHECK(clReleaseMemObject(d_y)); \ + CL_CHECK(clReleaseProgram(program)); \ + CL_CHECK(clReleaseKernel(kernel)); \ +} + +#define DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(name, operation) \ + DEFINE_OPENCL_BINARY_FUNC(float, name, operation) \ + DEFINE_OPENCL_BINARY_FUNC(double, name, operation) + +inline clblasTranspose to_clblasTranspose(const CBLAS_TRANSPOSE trans) { + switch (trans) { + case CblasNoTrans: + return clblasNoTrans; + case CblasTrans: + return clblasTrans; + case CblasConjTrans: + return clblasConjTrans; + default: + LOG(FATAL) << "Unknown CBLAS_TRANSPOSE " << trans; + } +} + +template +void caffe_opencl_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int n, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); + + +template +void caffe_opencl_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int n, + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); + +template +void caffe_opencl_axpy(const int n, const Dtype alpha, const Dtype* x, + Dtype* y); + +template +void caffe_opencl_axpby(const int n, const Dtype alpha, const Dtype* x, + const Dtype beta, Dtype* y); + +template +void caffe_opencl_copy(const int n, const Dtype *x, Dtype *y); + +template +void caffe_opencl_set(const int n, const Dtype alpha, Dtype *x); + +template +void caffe_opencl_add_scalar(const int n, const Dtype alpha, Dtype *x); + +template +void caffe_opencl_scal(const int n, const Dtype alpha, Dtype *x); + +template +Dtype caffe_opencl_dot(const int n, const Dtype* x, const Dtype* y); + +template +int caffe_opencl_hamming_distance(const int n, const Dtype* x, const Dtype* y); + +// Returns the sum of the absolute values of the elements of vector x +template +Dtype caffe_opencl_asum(const int n, const Dtype* x); + +template +void caffe_opencl_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); + +template +void caffe_opencl_copy_from_cpu(const int n, const Dtype *x, Dtype *y); + +template +void caffe_opencl_sqr(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_exp(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_sign(const int n, 
const Dtype* x, Dtype* y); + +template +void caffe_opencl_sgnbit(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_fabs(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_add(const int n, const Dtype* a, + const Dtype* b, Dtype* y); + +template +void caffe_opencl_sub(const int n, const Dtype* a, + const Dtype* b, Dtype* y); + +template +void caffe_opencl_mul(const int n, const Dtype* a, + const Dtype* b, Dtype* y); + +template +void caffe_opencl_div(const int n, const Dtype* a, + const Dtype* b, Dtype* y); +} // namespace caffe + + +#endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ +#endif // USE_OPENCL + diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index b68dcbf6e83..abe6f0c6483 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -33,6 +33,12 @@ class ArgMaxLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) { + NOT_IMPLEMENTED; + } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_ARGMAX; @@ -41,12 +47,6 @@ class ArgMaxLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - NOT_IMPLEMENTED; - } bool out_max_val_; }; @@ -61,6 +61,11 @@ class ConcatLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, + vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_CONCAT; @@ -69,15 +74,6 @@ class ConcatLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - Blob col_bob_; int count_; int num_; @@ -96,6 +92,10 @@ class ConvolutionLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_CONVOLUTION; @@ -104,15 +104,6 @@ class ConvolutionLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - int kernel_size_; int stride_; int num_; @@ -140,6 +131,10 @@ class EltwiseLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const 
vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_ELTWISE; @@ -148,15 +143,6 @@ class EltwiseLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - EltwiseParameter_EltwiseOp op_; vector coeffs_; }; @@ -170,6 +156,10 @@ class FlattenLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_FLATTEN; @@ -178,15 +168,6 @@ class FlattenLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - int count_; }; @@ -199,6 +180,10 @@ class Im2colLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_IM2COL; @@ -207,15 +192,6 @@ class Im2colLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - int kernel_size_; int stride_; int channels_; @@ -233,6 +209,10 @@ class InnerProductLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_INNER_PRODUCT; @@ -241,15 +221,6 @@ class InnerProductLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - int M_; int K_; int N_; @@ -414,23 +385,12 @@ class SplitLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* 
top); - - virtual inline LayerParameter_LayerType type() const { - return LayerParameter_LayerType_SPLIT; - } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, + virtual Dtype Forward(const vector*>& bottom, vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, + virtual void Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom); + protected: int count_; }; diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index e603712fd82..64d77bdbfeb 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -96,6 +96,58 @@ Dtype* Blob::mutable_gpu_diff() { return reinterpret_cast(diff_->mutable_gpu_data()); } +template +const Dtype* Blob::const_data() const { + switch (Caffe::mode()) { + case Caffe::CPU: + return cpu_data(); + case Caffe::GPU: + return gpu_data(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(0); + } +} + +template +const Dtype* Blob::const_diff() const { + switch (Caffe::mode()) { + case Caffe::CPU: + return cpu_diff(); + case Caffe::GPU: + return gpu_diff(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(0); + } +} + +template +Dtype* Blob::mutable_data() { + switch (Caffe::mode()) { + case Caffe::CPU: + return mutable_cpu_data(); + case Caffe::GPU: + return mutable_gpu_data(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} + +template +Dtype* Blob::mutable_diff() { + switch (Caffe::mode()) { + case Caffe::CPU: + return mutable_cpu_diff(); + case Caffe::GPU: + return mutable_gpu_diff(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} + template void Blob::ShareData(const Blob& other) { CHECK_EQ(count_, other.count()); diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 631c8afd068..82d4f16cd70 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -10,7 +10,6 @@ namespace caffe { shared_ptr Caffe::singleton_; - // curand seeding int64_t cluster_seedgen(void) { int64_t s, seed, pid; diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp index 409965519ca..4c68c4a8fd4 100644 --- a/src/caffe/layers/accuracy_layer.cpp +++ b/src/caffe/layers/accuracy_layer.cpp @@ -30,11 +30,11 @@ void AccuracyLayer::SetUp( } template -Dtype AccuracyLayer::Forward_cpu(const vector*>& bottom, +Dtype AccuracyLayer::Forward(const vector*>& bottom, vector*>* top) { Dtype accuracy = 0; - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + const Dtype* bottom_label = bottom[1]->const_data(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); vector maxval(top_k_+1); @@ -62,7 +62,7 @@ Dtype AccuracyLayer::Forward_cpu(const vector*>& bottom, } // LOG(INFO) << "Accuracy: " << accuracy; - (*top)[0]->mutable_cpu_data()[0] = accuracy / num; + (*top)[0]->mutable_data()[0] = accuracy / num; // Accuracy layer should not be used as a loss function. 
return Dtype(0); diff --git a/src/caffe/layers/argmax_layer.cpp b/src/caffe/layers/argmax_layer.cpp index cc31c0f52d8..1c0c402d5ea 100644 --- a/src/caffe/layers/argmax_layer.cpp +++ b/src/caffe/layers/argmax_layer.cpp @@ -24,10 +24,10 @@ void ArgMaxLayer::SetUp(const vector*>& bottom, } template -Dtype ArgMaxLayer::Forward_cpu(const vector*>& bottom, +Dtype ArgMaxLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); for (int i = 0; i < num; ++i) { diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index 95e6bd8748c..3ff9e3f99df 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -15,8 +15,8 @@ const float kBNLL_THRESHOLD = 50.; template Dtype BNLLLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { top_data[i] = bottom_data[i] > 0 ? @@ -31,9 +31,9 @@ void BNLLLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); const int count = (*bottom)[0]->count(); Dtype expval; for (int i = 0; i < count; ++i) { diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index 4c894ddffc4..1b90181f20a 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -4,7 +4,7 @@ #include "caffe/layer.hpp" #include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" +#include "caffe/util/device.hpp" namespace caffe { @@ -41,63 +41,68 @@ void ConcatLayer::SetUp(const vector*>& bottom, } template -Dtype ConcatLayer::Forward_cpu(const vector*>& bottom, - vector*>* top) { - Dtype* top_data = (*top)[0]->mutable_cpu_data(); +Dtype ConcatLayer::Forward(const vector*>& bottom, + vector*>* top) { + Dtype* top_data = (*top)[0]->mutable_data(); if (concat_dim_== 0) { int offset_num = 0; for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); + const Dtype* bottom_data = bottom[i]->const_data(); int num_elem = bottom[i]->count(); - caffe_copy(num_elem, bottom_data, top_data+(*top)[0]->offset(offset_num)); + DeviceFactory::GetDevice()->copy(num_elem, bottom_data, + top_data+(*top)[0]->offset(offset_num)); offset_num += bottom[i]->num(); } } else if (concat_dim_ == 1) { int offset_channel = 0; for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); + const Dtype* bottom_data = bottom[i]->const_data(); int num_elem = bottom[i]->channels()*bottom[i]->height()*bottom[i]->width(); for (int n = 0; n < num_; ++n) { - caffe_copy(num_elem, bottom_data+bottom[i]->offset(n), + DeviceFactory::GetDevice()->copy( + num_elem, bottom_data+bottom[i]->offset(n), top_data+(*top)[0]->offset(n, offset_channel)); } 
offset_channel += bottom[i]->channels(); - } // concat_dim_ is guaranteed to be 0 or 1 by SetUp. + } + } else { + LOG(FATAL) << "concat_dim along dim" << concat_dim_ << + " not implemented yet"; } return Dtype(0.); } template -void ConcatLayer::Backward_cpu(const vector*>& top, +void ConcatLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); if (concat_dim_ == 0) { int offset_num = 0; for (int i = 0; i < bottom->size(); ++i) { Blob* blob = (*bottom)[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_cpu_diff(); - caffe_copy(blob->count(), top_diff + top[0]->offset(offset_num), - bottom_diff); - } + Dtype* bottom_diff = blob->mutable_diff(); + DeviceFactory::GetDevice()->copy(blob->count(), + top_diff+top[0]->offset(offset_num), bottom_diff); offset_num += blob->num(); } } else if (concat_dim_ == 1) { int offset_channel = 0; for (int i = 0; i < bottom->size(); ++i) { Blob* blob = (*bottom)[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_cpu_diff(); - int num_elem = blob->channels()*blob->height()*blob->width(); - for (int n = 0; n < num_; ++n) { - caffe_copy(num_elem, top_diff + top[0]->offset(n, offset_channel), - bottom_diff + blob->offset(n)); - } + Dtype* bottom_diff = blob->mutable_diff(); + int num_elem = blob->channels()*blob->height()*blob->width(); + for (int n = 0; n < num_; ++n) { + DeviceFactory::GetDevice()->copy( + num_elem, top_diff+top[0]->offset(n, offset_channel), + bottom_diff+blob->offset(n)); } offset_channel += blob->channels(); } - } // concat_dim_ is guaranteed to be 0 or 1 by SetUp. + } else { + LOG(FATAL) << "concat_dim along dim" << concat_dim_ << + " not implemented yet"; + } } INSTANTIATE_CLASS(ConcatLayer); diff --git a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/concat_layer.cu deleted file mode 100644 index ca0cf0c1b5b..00000000000 --- a/src/caffe/layers/concat_layer.cu +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
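The layer hunks in this patch route all math through DeviceFactory::GetDevice() instead of the caffe_cpu_*/caffe_gpu_* wrappers, which is what lets the per-layer .cu files be deleted. The caffe/util/device.hpp header itself is not reproduced in this part of the diff; a plausible shape for it, inferred only from the call sites and therefore hypothetical, is an abstract per-mode Device selected at runtime:

    // Hypothetical sketch of caffe/util/device.hpp, inferred from the call
    // sites in this patch; the real header may differ. CBLAS_TRANSPOSE comes
    // from the BLAS headers Caffe already depends on.
    template <typename Dtype>
    class Device {
     public:
      virtual ~Device() {}
      virtual void copy(const int n, const Dtype* x, Dtype* y) = 0;
      virtual void copy_from_cpu(const int n, const Dtype* x, Dtype* y) = 0;
      virtual void set(const int n, const Dtype alpha, Dtype* x) = 0;
      virtual void axpy(const int n, const Dtype alpha,
                        const Dtype* x, Dtype* y) = 0;
      virtual void gemm(const CBLAS_TRANSPOSE trans_a,
                        const CBLAS_TRANSPOSE trans_b,
                        const int m, const int n, const int k,
                        const Dtype alpha, const Dtype* a, const Dtype* b,
                        const Dtype beta, Dtype* c) = 0;
      // ... plus gemv, mul, div, sub, dot, asum, scal, scale, sign, axpby,
      // im2col, col2im and rng_bernoulli with the signatures used in the
      // layer hunks.
    };

    template <typename Dtype>
    class DeviceFactory {
     public:
      // Returns the Device implementation matching Caffe::mode().
      static Device<Dtype>* GetDevice();
    };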
- -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype ConcatLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - if (concat_dim_ == 0) { - int offset_num = 0; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - caffe_gpu_copy(bottom[i]->count(), bottom_data, - top_data + (*top)[0]->offset(offset_num)); - offset_num += bottom[i]->num(); - } - } else if (concat_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - int num_elem = - bottom[i]->channels() * bottom[i]->height() * bottom[i]->width(); - for (int n = 0; n < num_; ++n) { - caffe_gpu_copy(num_elem, bottom_data+bottom[i]->offset(n), - top_data + (*top)[0]->offset(n, offset_channel)); - } - offset_channel += bottom[i]->channels(); - } - } else { - LOG(FATAL) << "concat_dim along dim" << concat_dim_ << - " not implemented yet"; - } - return Dtype(0.); -} - -template -void ConcatLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - if (concat_dim_ == 0) { - int offset_num = 0; - for (int i = 0; i < bottom->size(); ++i) { - Blob* blob = (*bottom)[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_gpu_diff(); - caffe_gpu_copy(blob->count(), top_diff + top[0]->offset(offset_num), - bottom_diff); - } - offset_num += blob->num(); - } - } else if (concat_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < bottom->size(); ++i) { - Blob* blob = (*bottom)[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_gpu_diff(); - int num_elem = blob->channels()*blob->height()*blob->width(); - for (int n = 0; n < num_; ++n) { - caffe_gpu_copy(num_elem, top_diff + top[0]->offset(n, offset_channel), - bottom_diff + blob->offset(n)); - } - } - offset_channel += blob->channels(); - } - } else { - LOG(FATAL) << "concat_dim along dim" << concat_dim_ << - " not implemented yet"; - } -} - -INSTANTIATE_CLASS(ConcatLayer); - -} // namespace caffe diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 9ec8da47e1a..f791d32affb 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -68,7 +68,7 @@ void ConvolutionLayer::SetUp(const vector*>& bottom, if (bias_term_) { bias_multiplier_.reset(new SyncedMemory(N_ * sizeof(Dtype))); Dtype* bias_multiplier_data = - reinterpret_cast(bias_multiplier_->mutable_cpu_data()); + reinterpret_cast(bias_multiplier_->mutable_data()); for (int i = 0; i < N_; ++i) { bias_multiplier_data[i] = 1.; } @@ -77,30 +77,33 @@ void ConvolutionLayer::SetUp(const vector*>& bottom, template -Dtype ConvolutionLayer::Forward_cpu(const vector*>& bottom, +Dtype ConvolutionLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - Dtype* col_data = col_buffer_.mutable_cpu_data(); - const Dtype* weight = this->blobs_[0]->cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); + Dtype* col_data = col_buffer_.mutable_data(); + const Dtype* weight = this->blobs_[0]->const_data(); int weight_offset = M_ * K_; int col_offset = K_ * N_; int top_offset = M_ * N_; for (int n = 0; n < num_; ++n) { // First, im2col - 
im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_size_, pad_, stride_, col_data); + DeviceFactory::GetDevice()->im2col( + bottom_data + bottom[0]->offset(n), channels_, height_, + width_, kernel_size_, pad_, stride_, col_data); // Second, innerproduct with groups for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, K_, - (Dtype)1., weight + weight_offset * g, col_data + col_offset * g, - (Dtype)0., top_data + (*top)[0]->offset(n) + top_offset * g); + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasNoTrans, M_, N_, K_, + (Dtype)1., weight + weight_offset * g, col_data + col_offset * g, + (Dtype)0., top_data + (*top)[0]->offset(n) + top_offset * g); } // third, add bias if (bias_term_) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - N_, 1, (Dtype)1., this->blobs_[1]->cpu_data(), - reinterpret_cast(bias_multiplier_->cpu_data()), + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasNoTrans, num_output_, + N_, 1, (Dtype)1., this->blobs_[1]->const_data(), + reinterpret_cast(bias_multiplier_->const_data()), (Dtype)1., top_data + (*top)[0]->offset(n)); } } @@ -108,25 +111,26 @@ Dtype ConvolutionLayer::Forward_cpu(const vector*>& bottom, } template -void ConvolutionLayer::Backward_cpu(const vector*>& top, +void ConvolutionLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* weight = this->blobs_[0]->cpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - Dtype* col_data = col_buffer_.mutable_cpu_data(); - Dtype* col_diff = col_buffer_.mutable_cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); + const Dtype* weight = this->blobs_[0]->const_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + Dtype* col_data = col_buffer_.mutable_data(); + Dtype* col_diff = col_buffer_.mutable_diff(); // bias gradient if necessary Dtype* bias_diff = NULL; if (bias_term_) { - bias_diff = this->blobs_[1]->mutable_cpu_diff(); - memset(bias_diff, 0, sizeof(Dtype) * this->blobs_[1]->count()); + bias_diff = this->blobs_[1]->mutable_diff(); + DeviceFactory::GetDevice()->set( + this->blobs_[1]->count(), 0, bias_diff); for (int n = 0; n < num_; ++n) { - caffe_cpu_gemv(CblasNoTrans, num_output_, N_, + DeviceFactory::GetDevice()->gemv(CblasNoTrans, num_output_, N_, 1., top_diff + top[0]->offset(n), - reinterpret_cast(bias_multiplier_->cpu_data()), 1., + reinterpret_cast(bias_multiplier_->const_data()), 1., bias_diff); } } @@ -134,29 +138,34 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, int weight_offset = M_ * K_; int col_offset = K_ * N_; int top_offset = M_ * N_; - memset(weight_diff, 0, sizeof(Dtype) * this->blobs_[0]->count()); + DeviceFactory::GetDevice()->set( + this->blobs_[0]->count(), 0, weight_diff); for (int n = 0; n < num_; ++n) { // since we saved memory in the forward pass by not storing all col data, // we will need to recompute them. - im2col_cpu(bottom_data + (*bottom)[0]->offset(n), channels_, height_, - width_, kernel_size_, pad_, stride_, col_data); + DeviceFactory::GetDevice()->im2col( + bottom_data + (*bottom)[0]->offset(n), channels_, height_, + width_, kernel_size_, pad_, stride_, col_data); // gradient w.r.t. weight. 
Note that we will accumulate diffs. for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasNoTrans, CblasTrans, M_, K_, N_, - (Dtype)1., top_diff + top[0]->offset(n) + top_offset * g, - col_data + col_offset * g, (Dtype)1., - weight_diff + weight_offset * g); + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasTrans, M_, K_, N_, + (Dtype)1., top_diff + top[0]->offset(n) + top_offset * g, + col_data + col_offset * g, (Dtype)1., + weight_diff + weight_offset * g); } // gradient w.r.t. bottom data, if necessary if (propagate_down[0]) { for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasTrans, CblasNoTrans, K_, N_, M_, - (Dtype)1., weight + weight_offset * g, - top_diff + top[0]->offset(n) + top_offset * g, - (Dtype)0., col_diff + col_offset * g); + DeviceFactory::GetDevice()->gemm( + CblasTrans, CblasNoTrans, K_, N_, M_, + (Dtype)1., weight + weight_offset * g, + top_diff + top[0]->offset(n) + top_offset * g, + (Dtype)0., col_diff + col_offset * g); } // col2im back to the data - col2im_cpu(col_diff, channels_, height_, width_, kernel_size_, pad_, + DeviceFactory::GetDevice()->col2im( + col_diff, channels_, height_, width_, kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n)); } } diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu deleted file mode 100644 index 85f95fd32c9..00000000000 --- a/src/caffe/layers/conv_layer.cu +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/filler.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype ConvolutionLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - Dtype* col_data = col_buffer_.mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - int weight_offset = M_ * K_; - int col_offset = K_ * N_; - int top_offset = M_ * N_; - for (int n = 0; n < num_; ++n) { - // First, im2col - im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_size_, pad_, stride_, col_data); - // Second, innerproduct with groups - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, K_, - (Dtype)1., weight + weight_offset * g, col_data + col_offset * g, - (Dtype)0., top_data + (*top)[0]->offset(n) + top_offset * g); - } - // third, add bias - if (bias_term_) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - N_, 1, (Dtype)1., this->blobs_[1]->gpu_data(), - reinterpret_cast(bias_multiplier_->gpu_data()), - (Dtype)1., top_data + (*top)[0]->offset(n)); - } - } - return Dtype(0.); -} - -template -void ConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - const Dtype* bottom_data = (*bottom)[0]->gpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - Dtype* col_data = col_buffer_.mutable_gpu_data(); - Dtype* col_diff = col_buffer_.mutable_gpu_diff(); - // bias gradient if necessary - Dtype* bias_diff = NULL; - - if (bias_term_) { - bias_diff = this->blobs_[1]->mutable_gpu_diff(); - CUDA_CHECK(cudaMemset(bias_diff, 0, - sizeof(Dtype) * this->blobs_[1]->count())); - for (int n = 0; n < num_; ++n) { - 
caffe_gpu_gemv(CblasNoTrans, num_output_, N_, - 1., top_diff + top[0]->offset(n), - reinterpret_cast(bias_multiplier_->gpu_data()), - 1., bias_diff); - } - } - - int weight_offset = M_ * K_; - int col_offset = K_ * N_; - int top_offset = M_ * N_; - CUDA_CHECK(cudaMemset(weight_diff, 0, - sizeof(Dtype) * this->blobs_[0]->count())); - for (int n = 0; n < num_; ++n) { - // since we saved memory in the forward pass by not storing all col data, - // we will need to recompute them. - im2col_gpu(bottom_data + (*bottom)[0]->offset(n), channels_, height_, - width_, kernel_size_, pad_, stride_, col_data); - // gradient w.r.t. weight. Note that we will accumulate diffs. - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, K_, N_, - (Dtype)1., top_diff + top[0]->offset(n) + top_offset * g, - col_data + col_offset * g, (Dtype)1., - weight_diff + weight_offset * g); - } - // gradient w.r.t. bottom data, if necessary - if (propagate_down[0]) { - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasTrans, CblasNoTrans, K_, N_, M_, - (Dtype)1., weight + weight_offset * g, - top_diff + top[0]->offset(n) + top_offset * g, - (Dtype)0., col_diff + col_offset * g); - } - // col2im back to the data - col2im_gpu(col_diff, channels_, height_, width_, kernel_size_, pad_, - stride_, bottom_diff + (*bottom)[0]->offset(n)); - } - } -} - - -INSTANTIATE_CLASS(ConvolutionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 29c4fec8ca4..c3503d838cd 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -25,10 +25,10 @@ void* DataLayerPrefetch(void* layer_pointer) { CHECK(layer); Datum datum; CHECK(layer->prefetch_data_); - Dtype* top_data = layer->prefetch_data_->mutable_cpu_data(); + Dtype* top_data = layer->prefetch_data_->mutable_data(); Dtype* top_label = NULL; // suppress warnings about uninitialized variables if (layer->output_labels_) { - top_label = layer->prefetch_label_->mutable_cpu_data(); + top_label = layer->prefetch_label_->mutable_data(); } const Dtype scale = layer->layer_param_.data_param().scale(); const int batch_size = layer->layer_param_.data_param().batch_size(); @@ -44,7 +44,7 @@ void* DataLayerPrefetch(void* layer_pointer) { const int height = layer->datum_height_; const int width = layer->datum_width_; const int size = layer->datum_size_; - const Dtype* mean = layer->data_mean_.cpu_data(); + const Dtype* mean = layer->data_mean_.const_data(); for (int item_id = 0; item_id < batch_size; ++item_id) { // get a blob switch (layer->layer_param_.data_param().backend()) { @@ -302,14 +302,14 @@ void DataLayer::SetUp(const vector*>& bottom, data_mean_.Reshape(1, datum_channels_, datum_height_, datum_width_); } // Now, start the prefetch thread. Before calling prefetch, we make two - // cpu_data calls so that the prefetch thread does not accidentally make + // const_data calls so that the prefetch thread does not accidentally make // simultaneous cudaMalloc calls when the main thread is running. In some // GPUs this seems to cause failures if we do not so. 
- prefetch_data_->mutable_cpu_data(); + prefetch_data_->mutable_data(); if (output_labels_) { - prefetch_label_->mutable_cpu_data(); + prefetch_label_->mutable_data(); } - data_mean_.cpu_data(); + data_mean_.const_data(); DLOG(INFO) << "Initializing prefetch"; CreatePrefetchThread(); DLOG(INFO) << "Prefetch initialized."; @@ -346,16 +346,18 @@ unsigned int DataLayer::PrefetchRand() { } template -Dtype DataLayer::Forward_cpu(const vector*>& bottom, +Dtype DataLayer::Forward(const vector*>& bottom, vector*>* top) { // First, join the thread JoinPrefetchThread(); // Copy the data - caffe_copy(prefetch_data_->count(), prefetch_data_->cpu_data(), - (*top)[0]->mutable_cpu_data()); + DeviceFactory::GetDevice()->copy_from_cpu( + prefetch_data_->count(), prefetch_data_->const_data(), + (*top)[0]->mutable_data()); if (output_labels_) { - caffe_copy(prefetch_label_->count(), prefetch_label_->cpu_data(), - (*top)[1]->mutable_cpu_data()); + DeviceFactory::GetDevice()->copy_from_cpu( + prefetch_label_->count(), prefetch_label_->const_data(), + (*top)[1]->mutable_data()); } // Start a new prefetch thread CreatePrefetchThread(); diff --git a/src/caffe/layers/data_layer.cu b/src/caffe/layers/data_layer.cu deleted file mode 100644 index 2ff9a292b3e..00000000000 --- a/src/caffe/layers/data_layer.cu +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include -#include -#include - -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -using std::string; - -namespace caffe { - -template -Dtype DataLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - // First, join the thread - JoinPrefetchThread(); - // Copy the data - CUDA_CHECK(cudaMemcpy((*top)[0]->mutable_gpu_data(), - prefetch_data_->cpu_data(), sizeof(Dtype) * prefetch_data_->count(), - cudaMemcpyHostToDevice)); - if (output_labels_) { - CUDA_CHECK(cudaMemcpy((*top)[1]->mutable_gpu_data(), - prefetch_label_->cpu_data(), sizeof(Dtype) * prefetch_label_->count(), - cudaMemcpyHostToDevice)); - } - // Start a new prefetch thread - CreatePrefetchThread(); - return Dtype(0.); -} - -INSTANTIATE_CLASS(DataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index e9a1a524d63..eafb74aaae7 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -29,18 +29,20 @@ void DropoutLayer::SetUp(const vector*>& bottom, template Dtype DropoutLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - unsigned int* mask = rand_vec_->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); + int* mask = reinterpret_cast(rand_vec_->mutable_data()); const int count = bottom[0]->count(); if (Caffe::phase() == Caffe::TRAIN) { // Create random numbers - caffe_rng_bernoulli(count, 1. - threshold_, mask); + DeviceFactory::GetDevice()->rng_bernoulli(count, 1. 
- threshold_, + mask); for (int i = 0; i < count; ++i) { top_data[i] = bottom_data[i] * mask[i] * scale_; } } else { - caffe_copy(bottom[0]->count(), bottom_data, top_data); + DeviceFactory::GetDevice()->copy(bottom[0]->count(), bottom_data, + top_data); } return Dtype(0); } @@ -51,9 +53,9 @@ void DropoutLayer::Backward_cpu(const vector*>& top, vector*>* bottom) { CHECK(Caffe::phase() == Caffe::TRAIN); if (propagate_down[0]) { - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - const unsigned int* mask = rand_vec_->cpu_data(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + const unsigned int* mask = rand_vec_->const_data(); const int count = (*bottom)[0]->count(); for (int i = 0; i < count; ++i) { bottom_diff[i] = top_diff[i] * mask[i] * scale_; @@ -61,8 +63,6 @@ void DropoutLayer::Backward_cpu(const vector*>& top, } } - INSTANTIATE_CLASS(DropoutLayer); - } // namespace caffe diff --git a/src/caffe/layers/dummy_data_layer.cpp b/src/caffe/layers/dummy_data_layer.cpp index 58044f4c952..a4b0b17a2e4 100644 --- a/src/caffe/layers/dummy_data_layer.cpp +++ b/src/caffe/layers/dummy_data_layer.cpp @@ -84,7 +84,7 @@ void DummyDataLayer::SetUp(const vector*>& bottom, } template -Dtype DummyDataLayer::Forward_cpu(const vector*>& bottom, +Dtype DummyDataLayer::Forward(const vector*>& bottom, vector*>* top) { for (int i = 0; i < top->size(); ++i) { const int filler_id = (fillers_.size() > 1) ? i : 0; diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index 2c265f6678f..adf879e6842 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -41,22 +41,25 @@ void EltwiseLayer::SetUp(const vector*>& bottom, } template -Dtype EltwiseLayer::Forward_cpu( +Dtype EltwiseLayer::Forward( const vector*>& bottom, vector*>* top) { const int count = (*top)[0]->count(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + Dtype* top_data = (*top)[0]->mutable_data(); switch (op_) { case EltwiseParameter_EltwiseOp_PROD: - caffe_mul(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), top_data); + DeviceFactory::GetDevice()->mul(count, bottom[0]->const_data(), + bottom[1]->const_data(), top_data); for (int i = 2; i < bottom.size(); ++i) { - caffe_mul(count, top_data, bottom[i]->cpu_data(), top_data); + DeviceFactory::GetDevice()->mul( + count, top_data, bottom[i]->const_data(), top_data); } break; case EltwiseParameter_EltwiseOp_SUM: - caffe_set(count, Dtype(0), top_data); + DeviceFactory::GetDevice()->set(count, Dtype(0), top_data); // TODO(shelhamer) does BLAS optimize to sum for coeff = 1? 
for (int i = 0; i < bottom.size(); ++i) { - caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data); + DeviceFactory::GetDevice()->axpy( + count, coeffs_[i], bottom[i]->const_data(), top_data); } break; default: @@ -66,25 +69,29 @@ Dtype EltwiseLayer::Forward_cpu( } template -void EltwiseLayer::Backward_cpu(const vector*>& top, +void EltwiseLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { const int count = top[0]->count(); - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_data = top[0]->const_data(); + const Dtype* top_diff = top[0]->const_diff(); for (int i = 0; i < bottom->size(); ++i) { if (propagate_down[i]) { - const Dtype* bottom_data = (*bottom)[i]->cpu_data(); - Dtype* bottom_diff = (*bottom)[i]->mutable_cpu_diff(); + const Dtype* bottom_data = (*bottom)[i]->const_data(); + Dtype* bottom_diff = (*bottom)[i]->mutable_diff(); switch (op_) { case EltwiseParameter_EltwiseOp_PROD: - caffe_div(count, top_data, bottom_data, bottom_diff); - caffe_mul(count, bottom_diff, top_diff, bottom_diff); + DeviceFactory::GetDevice()->div( + count, top_data, bottom_data, bottom_diff); + DeviceFactory::GetDevice()->mul( + count, bottom_diff, top_diff, bottom_diff); break; case EltwiseParameter_EltwiseOp_SUM: if (coeffs_[i] == Dtype(1)) { - caffe_copy(count, top_diff, bottom_diff); + DeviceFactory::GetDevice()->copy( + count, top_diff, bottom_diff); } else { - caffe_cpu_scale(count, coeffs_[i], top_diff, bottom_diff); + DeviceFactory::GetDevice()->scale( + count, coeffs_[i], top_diff, bottom_diff); } break; default: diff --git a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/eltwise_layer.cu deleted file mode 100644 index 3860944889c..00000000000 --- a/src/caffe/layers/eltwise_layer.cu +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype EltwiseLayer::Forward_gpu( - const vector*>& bottom, vector*>* top) { - const int count = (*top)[0]->count(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - caffe_gpu_mul(count, bottom[0]->gpu_data(), - bottom[1]->gpu_data(), top_data); - for (int i = 2; i < bottom.size(); ++i) { - caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_SUM: - caffe_gpu_set(count, Dtype(0.), top_data); - // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? 
- for (int i = 0; i < bottom.size(); ++i) { - caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } - return Dtype(0.); -} - -template -void EltwiseLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - const int count = top[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - for (int i = 0; i < bottom->size(); ++i) { - if (propagate_down[i]) { - const Dtype* bottom_data = (*bottom)[i]->gpu_data(); - Dtype* bottom_diff = (*bottom)[i]->mutable_gpu_diff(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); - break; - case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1.)) { - caffe_gpu_copy(count, top_diff, bottom_diff); - } else { - caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } - } - } -} - -INSTANTIATE_CLASS(EltwiseLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index 2478a514cac..a88ec8ea3cc 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -20,34 +20,36 @@ void EuclideanLossLayer::FurtherSetUp( } template -Dtype EuclideanLossLayer::Forward_cpu(const vector*>& bottom, +Dtype EuclideanLossLayer::Forward(const vector*>& bottom, vector*>* top) { int count = bottom[0]->count(); - caffe_sub( + DeviceFactory::GetDevice()->sub( count, - bottom[0]->cpu_data(), - bottom[1]->cpu_data(), - diff_.mutable_cpu_data()); - Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data()); + bottom[0]->const_data(), + bottom[1]->const_data(), + diff_.mutable_data()); + Dtype dot; + DeviceFactory::GetDevice()->dot(count, diff_.const_data(), + diff_.const_data(), &dot); Dtype loss = dot / bottom[0]->num() / Dtype(2); if (top->size() == 1) { - (*top)[0]->mutable_cpu_data()[0] = loss; + (*top)[0]->mutable_data()[0] = loss; } return loss; } template -void EuclideanLossLayer::Backward_cpu(const vector*>& top, +void EuclideanLossLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { for (int i = 0; i < 2; ++i) { if (propagate_down[i]) { const Dtype sign = (i == 0) ? 1 : -1; - caffe_cpu_axpby( + DeviceFactory::GetDevice()->axpby( (*bottom)[i]->count(), // count sign / (*bottom)[i]->num(), // alpha - diff_.cpu_data(), // a + diff_.const_data(), // a Dtype(0), // beta - (*bottom)[i]->mutable_cpu_diff()); // b + (*bottom)[i]->mutable_diff()); // b } } } diff --git a/src/caffe/layers/euclidean_loss_layer.cu b/src/caffe/layers/euclidean_loss_layer.cu deleted file mode 100644 index b070ea96ff8..00000000000 --- a/src/caffe/layers/euclidean_loss_layer.cu +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
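For reference, the unified Euclidean-loss forward pass from the hunk above reduces to one device sub and one device dot; it is restated here with the <Dtype> template arguments that the flattened diff omits:

    template <typename Dtype>
    Dtype EuclideanLossLayer<Dtype>::Forward(
        const vector<Blob<Dtype>*>& bottom, vector<Blob<Dtype>*>* top) {
      int count = bottom[0]->count();
      // diff_ = bottom[0] - bottom[1], computed on whichever device is active.
      DeviceFactory<Dtype>::GetDevice()->sub(
          count, bottom[0]->const_data(), bottom[1]->const_data(),
          diff_.mutable_data());
      Dtype dot;
      DeviceFactory<Dtype>::GetDevice()->dot(
          count, diff_.const_data(), diff_.const_data(), &dot);
      Dtype loss = dot / bottom[0]->num() / Dtype(2);
      if (top->size() == 1) {
        (*top)[0]->mutable_data()[0] = loss;
      }
      return loss;
    }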
- -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/util/io.hpp" - -namespace caffe { - -template -Dtype EuclideanLossLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), - bottom[1]->gpu_data(), - diff_.mutable_gpu_data()); - Dtype dot; - caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); - Dtype loss = dot / bottom[0]->num() / Dtype(2); - return loss; -} - -template -void EuclideanLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const Dtype sign = (i == 0) ? 1 : -1; - caffe_gpu_axpby( - (*bottom)[i]->count(), // count - sign / (*bottom)[i]->num(), // alpha - diff_.gpu_data(), // a - Dtype(0), // beta - (*bottom)[i]->mutable_gpu_diff()); // b - } - } -} - -INSTANTIATE_CLASS(EuclideanLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp index 9494da9a255..d9799db026f 100644 --- a/src/caffe/layers/flatten_layer.cpp +++ b/src/caffe/layers/flatten_layer.cpp @@ -21,14 +21,14 @@ void FlattenLayer::SetUp(const vector*>& bottom, } template -Dtype FlattenLayer::Forward_cpu(const vector*>& bottom, +Dtype FlattenLayer::Forward(const vector*>& bottom, vector*>* top) { (*top)[0]->ShareData(*bottom[0]); return Dtype(0.); } template -void FlattenLayer::Backward_cpu(const vector*>& top, +void FlattenLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { (*bottom)[0]->ShareDiff(*top[0]); } diff --git a/src/caffe/layers/flatten_layer.cu b/src/caffe/layers/flatten_layer.cu deleted file mode 100644 index 68add383c48..00000000000 --- a/src/caffe/layers/flatten_layer.cu +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
- -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype FlattenLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - (*top)[0]->ShareData(*bottom[0]); - return Dtype(0.); -} - -template -void FlattenLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - (*bottom)[0]->ShareDiff(*top[0]); -} - -INSTANTIATE_CLASS(FlattenLayer); - -} // namespace caffe diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 2ba7fa77f45..02121512f16 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -86,7 +86,7 @@ void HDF5DataLayer::SetUp(const vector*>& bottom, } template -Dtype HDF5DataLayer::Forward_cpu(const vector*>& bottom, +Dtype HDF5DataLayer::Forward(const vector*>& bottom, vector*>* top) { const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); const int data_count = (*top)[0]->count() / (*top)[0]->num(); @@ -104,12 +104,13 @@ Dtype HDF5DataLayer::Forward_cpu(const vector*>& bottom, } current_row_ = 0; } - memcpy(&(*top)[0]->mutable_cpu_data()[i * data_count], - &data_blob_.cpu_data()[current_row_ * data_count], - sizeof(Dtype) * data_count); - memcpy(&(*top)[1]->mutable_cpu_data()[i * label_data_count], - &label_blob_.cpu_data()[current_row_ * label_data_count], - sizeof(Dtype) * label_data_count); + DeviceFactory::GetDevice()->copy_from_cpu( + data_count, &data_blob_.const_data()[current_row_ * data_count], + &(*top)[0]->mutable_data()[i * data_count]); + DeviceFactory::GetDevice()->copy_from_cpu( + label_data_count, + &label_blob_.const_data()[current_row_ * label_data_count], + &(*top)[1]->mutable_data()[i * label_data_count]); } return Dtype(0.); } diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu deleted file mode 100644 index b2b09ef7dd1..00000000000 --- a/src/caffe/layers/hdf5_data_layer.cu +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
-/* -TODO: -- only load parts of the file, in accordance with a prototxt param "max_mem" -*/ - -#include -#include -#include - -#include "hdf5.h" -#include "hdf5_hl.h" - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -using std::string; - -namespace caffe { - -template -Dtype HDF5DataLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - const int data_count = (*top)[0]->count() / (*top)[0]->num(); - const int label_data_count = (*top)[1]->count() / (*top)[1]->num(); - - for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == data_blob_.num()) { - if (num_files_ > 1) { - current_file_ += 1; - - if (current_file_ == num_files_) { - current_file_ = 0; - LOG(INFO) << "looping around to first file"; - } - - LoadHDF5FileData(hdf_filenames_[current_file_].c_str()); - } - current_row_ = 0; - } - CUDA_CHECK(cudaMemcpy( - &(*top)[0]->mutable_gpu_data()[i * data_count], - &data_blob_.cpu_data()[current_row_ * data_count], - sizeof(Dtype) * data_count, - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy( - &(*top)[1]->mutable_gpu_data()[i * label_data_count], - &label_blob_.cpu_data()[current_row_ * label_data_count], - sizeof(Dtype) * label_data_count, - cudaMemcpyHostToDevice)); - } - return Dtype(0.); -} - -INSTANTIATE_CLASS(HDF5DataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index 3a513b9c366..cc87e777f81 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -42,7 +42,7 @@ void HDF5OutputLayer::SaveBlobs() { } template -Dtype HDF5OutputLayer::Forward_cpu(const vector*>& bottom, +Dtype HDF5OutputLayer::Forward(const vector*>& bottom, vector*>* top) { CHECK_GE(bottom.size(), 2); CHECK_EQ(bottom[0]->num(), bottom[1]->num()); @@ -54,23 +54,17 @@ Dtype HDF5OutputLayer::Forward_cpu(const vector*>& bottom, const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); for (int i = 0; i < bottom[0]->num(); ++i) { - memcpy(&data_blob_.mutable_cpu_data()[i * data_datum_dim], - &bottom[0]->cpu_data()[i * data_datum_dim], - sizeof(Dtype) * data_datum_dim); - memcpy(&label_blob_.mutable_cpu_data()[i * label_datum_dim], - &bottom[1]->cpu_data()[i * label_datum_dim], - sizeof(Dtype) * label_datum_dim); + DeviceFactory::GetDevice()->copy_from_cpu( + data_datum_dim, &bottom[0]->const_data()[i * data_datum_dim], + &data_blob_.mutable_data()[i * data_datum_dim]); + DeviceFactory::GetDevice()->copy_from_cpu( + label_datum_dim, &bottom[1]->const_data()[i * label_datum_dim], + &label_blob_.mutable_data()[i * label_datum_dim]); } SaveBlobs(); return Dtype(0.); } -template -void HDF5OutputLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - return; -} - INSTANTIATE_CLASS(HDF5OutputLayer); } // namespace caffe diff --git a/src/caffe/layers/hdf5_output_layer.cu b/src/caffe/layers/hdf5_output_layer.cu deleted file mode 100644 index 59505ee6acf..00000000000 --- a/src/caffe/layers/hdf5_output_layer.cu +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
- -#include - -#include "hdf5.h" -#include "hdf5_hl.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { -using std::vector; - -template -Dtype HDF5OutputLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - CHECK_GE(bottom.size(), 2); - CHECK_EQ(bottom[0]->num(), bottom[1]->num()); - data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), - bottom[1]->height(), bottom[1]->width()); - const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); - const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); - - for (int i = 0; i < bottom[0]->num(); ++i) { - CUDA_CHECK(cudaMemcpy(&data_blob_.mutable_cpu_data()[i * data_datum_dim], - &bottom[0]->gpu_data()[i * data_datum_dim], - sizeof(Dtype) * data_datum_dim, cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(&label_blob_.mutable_cpu_data()[i * label_datum_dim], - &bottom[1]->gpu_data()[i * label_datum_dim], - sizeof(Dtype) * label_datum_dim, cudaMemcpyDeviceToHost)); - } - SaveBlobs(); - return Dtype(0.); -} - -template -void HDF5OutputLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - return; -} - -INSTANTIATE_CLASS(HDF5OutputLayer); - -} // namespace caffe diff --git a/src/caffe/layers/hinge_loss_layer.cpp b/src/caffe/layers/hinge_loss_layer.cpp index 8097761d22b..221b4449360 100644 --- a/src/caffe/layers/hinge_loss_layer.cpp +++ b/src/caffe/layers/hinge_loss_layer.cpp @@ -15,16 +15,16 @@ using std::max; namespace caffe { template -Dtype HingeLossLayer::Forward_cpu(const vector*>& bottom, +Dtype HingeLossLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const Dtype* label = bottom[1]->cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* bottom_diff = bottom[0]->mutable_diff(); + const Dtype* label = bottom[1]->const_data(); int num = bottom[0]->num(); int count = bottom[0]->count(); int dim = count / num; - caffe_copy(count, bottom_data, bottom_diff); + DeviceFactory::GetDevice()->copy(count, bottom_data, bottom_diff); for (int i = 0; i < num; ++i) { bottom_diff[i * dim + static_cast(label[i])] *= -1; } @@ -35,24 +35,29 @@ Dtype HingeLossLayer::Forward_cpu(const vector*>& bottom, } switch (this->layer_param_.hinge_loss_param().norm()) { case HingeLossParameter_Norm_L1: - return caffe_cpu_asum(count, bottom_diff) / num; + Dtype sum; + DeviceFactory::GetDevice()->asum(count, bottom_diff, &sum); + return sum / num; case HingeLossParameter_Norm_L2: - return caffe_cpu_dot(count, bottom_diff, bottom_diff) / num; + Dtype dot; + DeviceFactory::GetDevice()->dot(count, bottom_diff, + bottom_diff, &dot); + return dot / num; default: LOG(FATAL) << "Unknown Norm"; } } template -void HingeLossLayer::Backward_cpu(const vector*>& top, +void HingeLossLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type_name() << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - const Dtype* label = (*bottom)[1]->cpu_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + const Dtype* label = (*bottom)[1]->const_data(); int num = (*bottom)[0]->num(); int count = 
(*bottom)[0]->count(); int dim = count / num; @@ -63,11 +68,13 @@ void HingeLossLayer::Backward_cpu(const vector*>& top, switch (this->layer_param_.hinge_loss_param().norm()) { case HingeLossParameter_Norm_L1: - caffe_cpu_sign(count, bottom_diff, bottom_diff); - caffe_scal(count, Dtype(1. / num), bottom_diff); + DeviceFactory::GetDevice()->sign(count, bottom_diff, bottom_diff); + DeviceFactory::GetDevice()->scal(count, Dtype(1. / num), + bottom_diff); break; case HingeLossParameter_Norm_L2: - caffe_scal(count, Dtype(2. / num), bottom_diff); + DeviceFactory::GetDevice()->scal(count, Dtype(2. / num), + bottom_diff); break; default: LOG(FATAL) << "Unknown Norm"; diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index e047dfb80a7..13048829796 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -25,24 +25,26 @@ void Im2colLayer::SetUp(const vector*>& bottom, } template -Dtype Im2colLayer::Forward_cpu(const vector*>& bottom, +Dtype Im2colLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, + DeviceFactory::GetDevice()->im2col( + bottom_data + bottom[0]->offset(n), channels_, height_, width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n)); } return Dtype(0.); } template -void Im2colLayer::Backward_cpu(const vector*>& top, +void Im2colLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); for (int n = 0; n < top[0]->num(); ++n) { - col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_, + DeviceFactory::GetDevice()->col2im( + top_diff + top[0]->offset(n), channels_, height_, width_, kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n)); } } diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu deleted file mode 100644 index 9cfb74e815c..00000000000 --- a/src/caffe/layers/im2col_layer.cu +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
- -#include - -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/common.hpp" - -namespace caffe { - -template -Dtype Im2colLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n)); - } - return Dtype(0.); -} - -template -void Im2colLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - for (int n = 0; n < top[0]->num(); ++n) { - col2im_gpu(top_diff + top[0]->offset(n), channels_, height_, width_, - kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n)); - } -} - - -INSTANTIATE_CLASS(Im2colLayer); - -} // namespace caffe diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index 1f7368e7e4d..a890090efa6 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -30,8 +30,8 @@ void* ImageDataLayerPrefetch(void* layer_pointer) { CHECK(layer); Datum datum; CHECK(layer->prefetch_data_); - Dtype* top_data = layer->prefetch_data_->mutable_cpu_data(); - Dtype* top_label = layer->prefetch_label_->mutable_cpu_data(); + Dtype* top_data = layer->prefetch_data_->mutable_data(); + Dtype* top_label = layer->prefetch_label_->mutable_data(); ImageDataParameter image_data_param = layer->layer_param_.image_data_param(); const Dtype scale = image_data_param.scale(); const int batch_size = image_data_param.batch_size(); @@ -50,7 +50,7 @@ void* ImageDataLayerPrefetch(void* layer_pointer) { const int width = layer->datum_width_; const int size = layer->datum_size_; const int lines_size = layer->lines_.size(); - const Dtype* mean = layer->data_mean_.cpu_data(); + const Dtype* mean = layer->data_mean_.const_data(); for (int item_id = 0; item_id < batch_size; ++item_id) { // get a blob CHECK_GT(lines_size, layer->lines_id_); @@ -220,12 +220,12 @@ void ImageDataLayer::SetUp(const vector*>& bottom, data_mean_.Reshape(1, datum_channels_, datum_height_, datum_width_); } // Now, start the prefetch thread. Before calling prefetch, we make two - // cpu_data calls so that the prefetch thread does not accidentally make + // const_data calls so that the prefetch thread does not accidentally make // simultaneous cudaMalloc calls when the main thread is running. In some // GPUs this seems to cause failures if we do not so. 
- prefetch_data_->mutable_cpu_data(); - prefetch_label_->mutable_cpu_data(); - data_mean_.cpu_data(); + prefetch_data_->mutable_data(); + prefetch_label_->mutable_data(); + data_mean_.const_data(); DLOG(INFO) << "Initializing prefetch"; CreatePrefetchThread(); DLOG(INFO) << "Prefetch initialized."; @@ -273,15 +273,17 @@ unsigned int ImageDataLayer::PrefetchRand() { } template -Dtype ImageDataLayer::Forward_cpu(const vector*>& bottom, +Dtype ImageDataLayer::Forward(const vector*>& bottom, vector*>* top) { // First, join the thread JoinPrefetchThread(); // Copy the data - caffe_copy(prefetch_data_->count(), prefetch_data_->cpu_data(), - (*top)[0]->mutable_cpu_data()); - caffe_copy(prefetch_label_->count(), prefetch_label_->cpu_data(), - (*top)[1]->mutable_cpu_data()); + DeviceFactory::GetDevice()->copy_from_cpu( + prefetch_data_->count(), prefetch_data_->const_data(), + (*top)[0]->mutable_data()); + DeviceFactory::GetDevice()->copy_from_cpu( + prefetch_label_->count(), prefetch_label_->const_data(), + (*top)[1]->mutable_data()); // Start a new prefetch thread CreatePrefetchThread(); return Dtype(0.); diff --git a/src/caffe/layers/image_data_layer.cu b/src/caffe/layers/image_data_layer.cu deleted file mode 100644 index 98047297d80..00000000000 --- a/src/caffe/layers/image_data_layer.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include -#include -#include -#include - -#include -#include -#include // NOLINT(readability/streams) -#include // NOLINT(readability/streams) - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -using std::string; -using std::pair; - -namespace caffe { - -template -Dtype ImageDataLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - // First, join the thread - JoinPrefetchThread(); - // Copy the data - CUDA_CHECK(cudaMemcpy((*top)[0]->mutable_gpu_data(), - prefetch_data_->cpu_data(), sizeof(Dtype) * prefetch_data_->count(), - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy((*top)[1]->mutable_gpu_data(), - prefetch_label_->cpu_data(), sizeof(Dtype) * prefetch_label_->count(), - cudaMemcpyHostToDevice)); - // Start a new prefetch thread - CreatePrefetchThread(); - return Dtype(0.); -} - -INSTANTIATE_CLASS(ImageDataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp index a72874e4bb4..8a48d18f498 100644 --- a/src/caffe/layers/infogain_loss_layer.cpp +++ b/src/caffe/layers/infogain_loss_layer.cpp @@ -32,11 +32,11 @@ void InfogainLossLayer::FurtherSetUp( template -Dtype InfogainLossLayer::Forward_cpu(const vector*>& bottom, +Dtype InfogainLossLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); - const Dtype* infogain_mat = infogain_.cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + const Dtype* bottom_label = bottom[1]->const_data(); + const Dtype* infogain_mat = infogain_.const_data(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); CHECK_EQ(infogain_.height(), dim); @@ -49,13 +49,13 @@ Dtype InfogainLossLayer::Forward_cpu(const vector*>& bottom, } } if (top->size() == 1) { - (*top)[0]->mutable_cpu_data()[0] = loss / num; + (*top)[0]->mutable_data()[0] = loss / num; } return loss / num; } template -void InfogainLossLayer::Backward_cpu(const vector*>& top, +void InfogainLossLayer::Backward(const 
vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[1]) { @@ -63,10 +63,10 @@ void InfogainLossLayer::Backward_cpu(const vector*>& top, << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* bottom_label = (*bottom)[1]->cpu_data(); - const Dtype* infogain_mat = infogain_.cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + const Dtype* bottom_label = (*bottom)[1]->const_data(); + const Dtype* infogain_mat = infogain_.const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); int num = (*bottom)[0]->num(); int dim = (*bottom)[0]->count() / (*bottom)[0]->num(); CHECK_EQ(infogain_.height(), dim); diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index ddf55e49b63..08278c538da 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -49,7 +49,7 @@ void InnerProductLayer::SetUp(const vector*>& bottom, if (bias_term_) { bias_multiplier_.reset(new SyncedMemory(M_ * sizeof(Dtype))); Dtype* bias_multiplier_data = - reinterpret_cast(bias_multiplier_->mutable_cpu_data()); + reinterpret_cast(bias_multiplier_->mutable_data()); for (int i = 0; i < M_; ++i) { bias_multiplier_data[i] = 1.; } @@ -57,41 +57,46 @@ void InnerProductLayer::SetUp(const vector*>& bottom, } template -Dtype InnerProductLayer::Forward_cpu(const vector*>& bottom, +Dtype InnerProductLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - const Dtype* weight = this->blobs_[0]->cpu_data(); - caffe_cpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); + const Dtype* weight = this->blobs_[0]->const_data(); + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., bottom_data, weight, (Dtype)0., top_data); if (bias_term_) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., - reinterpret_cast(bias_multiplier_->cpu_data()), - this->blobs_[1]->cpu_data(), (Dtype)1., top_data); + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., + reinterpret_cast(bias_multiplier_->const_data()), + this->blobs_[1]->const_data(), (Dtype)1., top_data); } return Dtype(0); } template -void InnerProductLayer::Backward_cpu(const vector*>& top, +void InnerProductLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); + const Dtype* top_diff = top[0]->const_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); // Gradient with respect to weight - caffe_cpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_cpu_diff()); + DeviceFactory::GetDevice()->gemm( + CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., + top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_diff()); if (bias_term_) { // Gradient with respect to bias - caffe_cpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - reinterpret_cast(bias_multiplier_->cpu_data()), (Dtype)0., - this->blobs_[1]->mutable_cpu_diff()); + DeviceFactory::GetDevice()->gemv( + CblasTrans, M_, N_, (Dtype)1., top_diff, + 
reinterpret_cast(bias_multiplier_->const_data()), + (Dtype)0., this->blobs_[1]->mutable_diff()); } if (propagate_down[0]) { // Gradient with respect to bottom data - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, this->blobs_[0]->cpu_data(), (Dtype)0., - (*bottom)[0]->mutable_cpu_diff()); + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., + top_diff, this->blobs_[0]->const_data(), (Dtype)0., + (*bottom)[0]->mutable_diff()); } } diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu deleted file mode 100644 index 5b95a57b23b..00000000000 --- a/src/caffe/layers/inner_product_layer.cu +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include - -#include - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype InnerProductLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., - bottom_data, weight, (Dtype)0., top_data); - if (bias_term_) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., - reinterpret_cast(bias_multiplier_->gpu_data()), - this->blobs_[1]->gpu_data(), (Dtype)1., top_data); - } - return Dtype(0); -} - -template -void InnerProductLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = (*bottom)[0]->gpu_data(); - // Gradient with respect to weight - caffe_gpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_gpu_diff()); - if (bias_term_) { - // Gradient with respect to bias - caffe_gpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - reinterpret_cast(bias_multiplier_->gpu_data()), - (Dtype)0., this->blobs_[1]->mutable_gpu_diff()); - } - if (propagate_down[0]) { - // Gradient with respect to bottom data - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, this->blobs_[0]->gpu_data(), (Dtype)0., - (*bottom)[0]->mutable_gpu_diff()); - } -} - -INSTANTIATE_CLASS(InnerProductLayer); - -} // namespace caffe diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index a86c1d4c59d..7472e60860f 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -114,15 +114,15 @@ Dtype LRNLayer::Forward_cpu(const vector*>& bottom, template Dtype LRNLayer::CrossChannelForward_cpu( const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - Dtype* scale_data = scale_.mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); + Dtype* scale_data = scale_.mutable_data(); // start with the constant value for (int i = 0; i < scale_.count(); ++i) { scale_data[i] = 1.; } Blob padded_square(1, channels_ + size_ - 1, height_, width_); - Dtype* padded_square_data = padded_square.mutable_cpu_data(); + Dtype* padded_square_data = padded_square.mutable_data(); memset(padded_square_data, 0, sizeof(Dtype) * padded_square.count()); Dtype alpha_over_size = alpha_ / size_; // 
go through the images @@ -190,17 +190,17 @@ template void LRNLayer::CrossChannelBackward_cpu( const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* scale_data = scale_.cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); + const Dtype* top_data = top[0]->const_data(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + const Dtype* scale_data = scale_.const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); Blob padded_ratio(1, channels_ + size_ - 1, height_, width_); Blob accum_ratio(1, 1, height_, width_); - Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data(); - Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data(); + Dtype* padded_ratio_data = padded_ratio.mutable_data(); + Dtype* accum_ratio_data = accum_ratio.mutable_data(); // We hack a little bit by using the diff() to store an additional result - Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff(); + Dtype* accum_ratio_times_bottom = accum_ratio.mutable_diff(); memset(padded_ratio_data, 0, sizeof(Dtype) * padded_ratio.count()); Dtype cache_ratio_value = 2. * alpha_ * beta_ / size_; diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp index 15eedb317e3..cb8f1d57626 100644 --- a/src/caffe/layers/memory_data_layer.cpp +++ b/src/caffe/layers/memory_data_layer.cpp @@ -36,7 +36,7 @@ void MemoryDataLayer::Reset(Dtype* data, Dtype* labels, int n) { } template -Dtype MemoryDataLayer::Forward_cpu(const vector*>& bottom, +Dtype MemoryDataLayer::Forward(const vector*>& bottom, vector*>* top) { CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset"; (*top)[0]->set_cpu_data(data_ + pos_ * datum_size_); diff --git a/src/caffe/layers/multinomial_logistic_loss_layer.cpp b/src/caffe/layers/multinomial_logistic_loss_layer.cpp index 013d4034240..763ba9f9d80 100644 --- a/src/caffe/layers/multinomial_logistic_loss_layer.cpp +++ b/src/caffe/layers/multinomial_logistic_loss_layer.cpp @@ -23,10 +23,10 @@ void MultinomialLogisticLossLayer::FurtherSetUp( } template -Dtype MultinomialLogisticLossLayer::Forward_cpu( +Dtype MultinomialLogisticLossLayer::Forward( const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + const Dtype* bottom_label = bottom[1]->const_data(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); Dtype loss = 0; @@ -36,13 +36,13 @@ Dtype MultinomialLogisticLossLayer::Forward_cpu( loss -= log(prob); } if (top->size() == 1){ - (*top)[0]->mutable_cpu_data()[0] = loss / num; + (*top)[0]->mutable_data()[0] = loss / num; } return loss / num; } template -void MultinomialLogisticLossLayer::Backward_cpu( +void MultinomialLogisticLossLayer::Backward( const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[1]) { @@ -50,9 +50,9 @@ void MultinomialLogisticLossLayer::Backward_cpu( << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* bottom_label = (*bottom)[1]->cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + const Dtype* bottom_label = 
(*bottom)[1]->const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); int num = (*bottom)[0]->num(); int dim = (*bottom)[0]->count() / (*bottom)[0]->num(); memset(bottom_diff, 0, sizeof(Dtype) * (*bottom)[0]->count()); diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index bc002078814..303a5ef4e0a 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -82,8 +82,8 @@ void PoolingLayer::SetUp(const vector*>& bottom, template Dtype PoolingLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int top_count = (*top)[0]->count(); // We'll output the mask to top[1] if it's of size >1. const bool use_top_mask = top->size() > 1; @@ -95,10 +95,10 @@ Dtype PoolingLayer::Forward_cpu(const vector*>& bottom, case PoolingParameter_PoolMethod_MAX: // Initialize if (use_top_mask) { - top_mask = (*top)[1]->mutable_cpu_data(); + top_mask = (*top)[1]->mutable_data(); caffe_set(top_count, Dtype(-1), top_mask); } else { - mask = max_idx_->mutable_cpu_data(); + mask = max_idx_->mutable_data(); caffe_set(top_count, -1, mask); } caffe_set(top_count, Dtype(-FLT_MAX), top_data); @@ -188,8 +188,8 @@ void PoolingLayer::Backward_cpu(const vector*>& top, if (!propagate_down[0]) { return; } - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); // Different pooling methods. We explicitly do the switch outside the for // loop to save time, although this results in more codes. caffe_set((*bottom)[0]->count(), Dtype(0), bottom_diff); @@ -201,9 +201,9 @@ void PoolingLayer::Backward_cpu(const vector*>& top, case PoolingParameter_PoolMethod_MAX: // The main loop if (use_top_mask) { - top_mask = top[1]->cpu_data(); + top_mask = top[1]->const_data(); } else { - mask = max_idx_->cpu_data(); + mask = max_idx_->const_data(); } for (int n = 0; n < top[0]->num(); ++n) { for (int c = 0; c < channels_; ++c) { diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index 5ff3392968e..766214ecf01 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -23,78 +23,86 @@ void PowerLayer::SetUp(const vector*>& bottom, // Compute y = (shift + scale * x)^power template -Dtype PowerLayer::Forward_cpu(const vector*>& bottom, +Dtype PowerLayer::Forward(const vector*>& bottom, vector*>* top) { - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int count = bottom[0]->count(); // Special case where we can ignore the input: scale or power is 0. if (diff_scale_ == Dtype(0)) { Dtype value = (power_ == 0) ? 
Dtype(1) : pow(shift_, power_); - caffe_set(count, value, top_data); + DeviceFactory::GetDevice()->set(count, value, top_data); return Dtype(0); } - const Dtype* bottom_data = bottom[0]->cpu_data(); - caffe_copy(count, bottom_data, top_data); + const Dtype* bottom_data = bottom[0]->const_data(); + DeviceFactory::GetDevice()->copy(count, bottom_data, top_data); if (scale_ != Dtype(1)) { - caffe_scal(count, scale_, top_data); + DeviceFactory::GetDevice()->scal(count, scale_, top_data); } if (shift_ != Dtype(0)) { - caffe_add_scalar(count, shift_, top_data); + DeviceFactory::GetDevice()->add_scalar(count, shift_, top_data); } if (power_ != Dtype(1)) { - caffe_powx(count, top_data, power_, top_data); + DeviceFactory::GetDevice()->powx(count, top_data, power_, top_data); } return Dtype(0); } template -void PowerLayer::Backward_cpu(const vector*>& top, +void PowerLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); const int count = (*bottom)[0]->count(); - const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - caffe_set(count, diff_scale_, bottom_diff); + DeviceFactory::GetDevice()->set(count, diff_scale_, bottom_diff); } else { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) // = diff_scale * y / (shift + scale * x) if (power_ == Dtype(2)) { // Special case for y = (shift + scale * x)^2 // -> dy/dx = 2 * scale * (shift + scale * x) // = diff_scale * shift + diff_scale * scale * x - caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data, + DeviceFactory::GetDevice()->axpby( + count, diff_scale_ * scale_, bottom_data, Dtype(0), bottom_diff); if (shift_ != Dtype(0)) { - caffe_add_scalar(count, diff_scale_ * shift_, bottom_diff); + DeviceFactory::GetDevice()->add_scalar( + count, diff_scale_ * shift_, bottom_diff); } } else if (shift_ == Dtype(0)) { // Special case for y = (scale * x)^power // -> dy/dx = scale * power * (scale * x)^(power - 1) // = scale * power * (scale * x)^power * (scale * x)^(-1) // = power * y / x - const Dtype* top_data = top[0]->cpu_data(); - caffe_div(count, top_data, bottom_data, bottom_diff); - caffe_scal(count, power_, bottom_diff); + const Dtype* top_data = top[0]->const_data(); + DeviceFactory::GetDevice()->div( + count, top_data, bottom_data, bottom_diff); + DeviceFactory::GetDevice()->scal(count, power_, bottom_diff); } else { - caffe_copy(count, bottom_data, bottom_diff); + DeviceFactory::GetDevice()->copy(count, bottom_data, + bottom_diff); if (scale_ != Dtype(1)) { - caffe_scal(count, scale_, bottom_diff); + DeviceFactory::GetDevice()->scal(count, scale_, bottom_diff); } if (shift_ != Dtype(0)) { - caffe_add_scalar(count, shift_, bottom_diff); + DeviceFactory::GetDevice()->add_scalar(count, shift_, + bottom_diff); } - const Dtype* top_data = top[0]->cpu_data(); - caffe_div(count, top_data, bottom_diff, bottom_diff); + const Dtype* top_data = top[0]->const_data(); + DeviceFactory::GetDevice()->div(count, top_data, bottom_diff, + bottom_diff); if (diff_scale_ != Dtype(1)) { - caffe_scal(count, diff_scale_, bottom_diff); + DeviceFactory::GetDevice()->scal(count, diff_scale_, + bottom_diff); } } } if (diff_scale_ != Dtype(0)) { - caffe_mul(count, top_diff, bottom_diff, 
bottom_diff); + DeviceFactory::GetDevice()->mul(count, top_diff, bottom_diff, + bottom_diff); } } } diff --git a/src/caffe/layers/power_layer.cu b/src/caffe/layers/power_layer.cu deleted file mode 100644 index 6d699636e21..00000000000 --- a/src/caffe/layers/power_layer.cu +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -using std::max; - -namespace caffe { - -template -Dtype PowerLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // Special case where we can ignore the input: scale or power is 0. - if (diff_scale_ == Dtype(0)) { - Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); - caffe_gpu_set(count, value, top_data); - return Dtype(0); - } - const Dtype* bottom_data = bottom[0]->gpu_data(); - caffe_gpu_copy(count, bottom_data, top_data); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, top_data); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, top_data); - } - if (power_ != Dtype(1)) { - caffe_gpu_powx(count, top_data, power_, top_data); - } - return Dtype(0); -} - -template -void PowerLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - vector*>* bottom) { - if (propagate_down[0]) { - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - const int count = (*bottom)[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - caffe_gpu_set(count, diff_scale_, bottom_diff); - } else { - const Dtype* bottom_data = (*bottom)[0]->gpu_data(); - // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) - // = diff_scale * y / (shift + scale * x) - if (power_ == Dtype(2)) { - // Special case for y = (shift + scale * x)^2 - // -> dy/dx = 2 * scale * (shift + scale * x) - // = diff_scale * shift + diff_scale * scale * x - caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); - } - } else if (shift_ == Dtype(0)) { - // Special case for y = (scale * x)^power - // -> dy/dx = scale * power * (scale * x)^(power - 1) - // = scale * power * (scale * x)^power * (scale * x)^(-1) - // = power * y / x - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - caffe_gpu_scal(count, power_, bottom_diff); - } else { - caffe_gpu_copy(count, bottom_data, bottom_diff); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, bottom_diff); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, bottom_diff); - } - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); - if (diff_scale_ != Dtype(1)) { - caffe_gpu_scal(count, diff_scale_, bottom_diff); - } - } - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); - } -} - -INSTANTIATE_CLASS(PowerLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index d7a8509b247..6efda981228 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -13,8 +13,8 @@ namespace caffe { template Dtype ReLULayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + 
const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { top_data[i] = max(bottom_data[i], Dtype(0)); @@ -27,9 +27,9 @@ void ReLULayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); const int count = (*bottom)[0]->count(); for (int i = 0; i < count; ++i) { bottom_diff[i] = top_diff[i] * (bottom_data[i] > 0); diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index 8cb830ff248..fcc3a19a14d 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -25,7 +25,7 @@ void SigmoidCrossEntropyLossLayer::FurtherSetUp( } template -Dtype SigmoidCrossEntropyLossLayer::Forward_cpu( +Dtype SigmoidCrossEntropyLossLayer::Forward( const vector*>& bottom, vector*>* top) { // The forward pass computes the sigmoid outputs. sigmoid_bottom_vec_[0] = bottom[0]; @@ -34,21 +34,21 @@ Dtype SigmoidCrossEntropyLossLayer::Forward_cpu( const int count = bottom[0]->count(); const int num = bottom[0]->num(); // Stable version of loss computation from input data - const Dtype* input_data = bottom[0]->cpu_data(); - const Dtype* target = bottom[1]->cpu_data(); + const Dtype* input_data = bottom[0]->const_data(); + const Dtype* target = bottom[1]->const_data(); Dtype loss = 0; for (int i = 0; i < count; ++i) { loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) - log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); } if (top->size() == 1) { - (*top)[0]->mutable_cpu_data()[0] = loss / num; + (*top)[0]->mutable_data()[0] = loss / num; } return loss / num; } template -void SigmoidCrossEntropyLossLayer::Backward_cpu( +void SigmoidCrossEntropyLossLayer::Backward( const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[1]) { @@ -59,12 +59,14 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( // First, compute the diff const int count = (*bottom)[0]->count(); const int num = (*bottom)[0]->num(); - const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data(); - const Dtype* target = (*bottom)[1]->cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - caffe_sub(count, sigmoid_output_data, target, bottom_diff); + const Dtype* sigmoid_output_data = sigmoid_output_->const_data(); + const Dtype* target = (*bottom)[1]->const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + DeviceFactory::GetDevice()->sub( + count, sigmoid_output_data, target, bottom_diff); // Scale down gradient - caffe_scal(count, Dtype(1) / num, bottom_diff); + DeviceFactory::GetDevice()->scal(count, Dtype(1) / num, + bottom_diff); } } diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu deleted file mode 100644 index 8f7275827e2..00000000000 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
- -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -using std::max; - -namespace caffe { - -template -Dtype SigmoidCrossEntropyLossLayer::Forward_gpu( - const vector*>& bottom, vector*>* top) { - // The forward pass computes the sigmoid outputs. - sigmoid_bottom_vec_[0] = bottom[0]; - sigmoid_layer_->Forward(sigmoid_bottom_vec_, &sigmoid_top_vec_); - // Compute the loss (negative log likelihood) - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); - // Stable version of loss computation from input data - const Dtype* input_data = bottom[0]->cpu_data(); - const Dtype* target = bottom[1]->cpu_data(); - Dtype loss = 0; - for (int i = 0; i < count; ++i) { - loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) - - log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); - } - if (top->size() == 1) { - (*top)[0]->mutable_cpu_data()[0] = loss / num; - } - return loss / num; -} - -template -void SigmoidCrossEntropyLossLayer::Backward_gpu( - const vector*>& top, const vector& propagate_down, - vector*>* bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type_name() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - // First, compute the diff - const int count = (*bottom)[0]->count(); - const int num = (*bottom)[0]->num(); - const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); - const Dtype* target = (*bottom)[1]->gpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - caffe_gpu_copy(count, sigmoid_output_data, bottom_diff); - caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); - // Scale down gradient - caffe_gpu_scal(count, Dtype(1) / num, bottom_diff); - } -} - -INSTANTIATE_CLASS(SigmoidCrossEntropyLossLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index 50139d863dd..8d8afd3e356 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -17,8 +17,8 @@ inline Dtype sigmoid(Dtype x) { template Dtype SigmoidLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { top_data[i] = sigmoid(bottom_data[i]); @@ -31,9 +31,9 @@ void SigmoidLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* top_data = top[0]->const_data(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); const int count = (*bottom)[0]->count(); for (int i = 0; i < count; ++i) { const Dtype sigmoid_x = top_data[i]; diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 57847d005f6..14548c94c4d 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -19,7 +19,7 @@ void SoftmaxLayer::SetUp(const vector*>& bottom, bottom[0]->height(), bottom[0]->width()); sum_multiplier_.Reshape(1, bottom[0]->channels(), bottom[0]->height(), bottom[0]->width()); - Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); + Dtype* 
multiplier_data = sum_multiplier_.mutable_data(); for (int i = 0; i < sum_multiplier_.count(); ++i) { multiplier_data[i] = 1.; } @@ -29,9 +29,9 @@ void SoftmaxLayer::SetUp(const vector*>& bottom, template Dtype SoftmaxLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - Dtype* scale_data = scale_.mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); + Dtype* scale_data = scale_.mutable_data(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); memcpy(top_data, bottom_data, sizeof(Dtype) * bottom[0]->count()); @@ -45,12 +45,12 @@ Dtype SoftmaxLayer::Forward_cpu(const vector*>& bottom, } // subtraction caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - scale_data, sum_multiplier_.cpu_data(), 1., top_data); + scale_data, sum_multiplier_.const_data(), 1., top_data); // Perform exponentiation caffe_exp(num * dim, top_data, top_data); // sum after exp caffe_cpu_gemv(CblasNoTrans, num, dim, 1., top_data, - sum_multiplier_.cpu_data(), 0., scale_data); + sum_multiplier_.const_data(), 0., scale_data); // Do division for (int i = 0; i < num; ++i) { caffe_scal(dim, Dtype(1.) / scale_data[i], top_data + i * dim); @@ -62,10 +62,10 @@ template void SoftmaxLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* top_data = top[0]->cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - Dtype* scale_data = scale_.mutable_cpu_data(); + const Dtype* top_diff = top[0]->const_diff(); + const Dtype* top_data = top[0]->const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + Dtype* scale_data = scale_.mutable_data(); int num = top[0]->num(); int dim = top[0]->count() / top[0]->num(); memcpy(bottom_diff, top_diff, sizeof(Dtype) * top[0]->count()); @@ -76,7 +76,7 @@ void SoftmaxLayer::Backward_cpu(const vector*>& top, } // subtraction caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - scale_data, sum_multiplier_.cpu_data(), 1., bottom_diff); + scale_data, sum_multiplier_.const_data(), 1., bottom_diff); // elementwise multiplication caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); } diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index 1a3601aa9e6..19c04f874aa 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -32,13 +32,13 @@ void SoftmaxWithLossLayer::SetUp(const vector*>& bottom, } template -Dtype SoftmaxWithLossLayer::Forward_cpu( +Dtype SoftmaxWithLossLayer::Forward( const vector*>& bottom, vector*>* top) { // The forward pass computes the softmax prob values. 
softmax_bottom_vec_[0] = bottom[0]; softmax_layer_->Forward(softmax_bottom_vec_, &softmax_top_vec_); - const Dtype* prob_data = prob_.cpu_data(); - const Dtype* label = bottom[1]->cpu_data(); + const Dtype* prob_data = prob_.const_data(); + const Dtype* label = bottom[1]->const_data(); int num = prob_.num(); int dim = prob_.count() / num; Dtype loss = 0; @@ -47,7 +47,7 @@ Dtype SoftmaxWithLossLayer::Forward_cpu( Dtype(FLT_MIN))); } if (top->size() >= 1) { - (*top)[0]->mutable_cpu_data()[0] = loss / num; + (*top)[0]->mutable_data()[0] = loss / num; } if (top->size() == 2) { (*top)[1]->ShareData(prob_); @@ -56,7 +56,7 @@ Dtype SoftmaxWithLossLayer::Forward_cpu( } template -void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, +void SoftmaxWithLossLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[1]) { @@ -64,17 +64,18 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - const Dtype* prob_data = prob_.cpu_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + const Dtype* prob_data = prob_.const_data(); memcpy(bottom_diff, prob_data, sizeof(Dtype) * prob_.count()); - const Dtype* label = (*bottom)[1]->cpu_data(); + const Dtype* label = (*bottom)[1]->const_data(); int num = prob_.num(); int dim = prob_.count() / num; for (int i = 0; i < num; ++i) { bottom_diff[i * dim + static_cast(label[i])] -= 1; } // Scale down gradient - caffe_scal(prob_.count(), Dtype(1) / num, bottom_diff); + DeviceFactory::GetDevice()->scal(prob_.count(), Dtype(1) / num, + bottom_diff); } } diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu deleted file mode 100644 index e46be6ba85d..00000000000 --- a/src/caffe/layers/softmax_loss_layer.cu +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -using std::max; - -namespace caffe { - -template -Dtype SoftmaxWithLossLayer::Forward_gpu( - const vector*>& bottom, vector*>* top) { - // The forward pass computes the softmax prob values. - return Forward_cpu(bottom, top); -} - -template -void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - // TODO(Yangqing): implement the GPU version of softmax. - Backward_cpu(top, propagate_down, bottom); -} - -INSTANTIATE_CLASS(SoftmaxWithLossLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 28abd95f5ff..f90ea678b95 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -27,7 +27,7 @@ void SplitLayer::SetUp(const vector*>& bottom, } template -Dtype SplitLayer::Forward_cpu(const vector*>& bottom, +Dtype SplitLayer::Forward(const vector*>& bottom, vector*>* top) { for (int i = 0; i < top->size(); ++i) { (*top)[i]->ShareData(*bottom[0]); @@ -36,15 +36,16 @@ Dtype SplitLayer::Forward_cpu(const vector*>& bottom, } template -void SplitLayer::Backward_cpu(const vector*>& top, +void SplitLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { (*bottom)[0]->ShareDiff(*top[0]); // Add remaining top blob diffs. 
- Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); for (int i = 1; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->cpu_diff(); - caffe_axpy(count_, Dtype(1.), top_diff, bottom_diff); + const Dtype* top_diff = top[i]->const_diff(); + DeviceFactory::GetDevice()->axpy( + count_, Dtype(1.), top_diff, bottom_diff); } } } diff --git a/src/caffe/layers/split_layer.cu b/src/caffe/layers/split_layer.cu deleted file mode 100644 index 4c921d39f17..00000000000 --- a/src/caffe/layers/split_layer.cu +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype SplitLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - for (int i = 0; i < top->size(); ++i) { - (*top)[i]->ShareData(*bottom[0]); - } - return Dtype(0.); -} - -template -void SplitLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - if (propagate_down[0]) { - (*bottom)[0]->ShareDiff(*top[0]); - // Add remaining top blob diffs. - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - for (int i = 1; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); - } - } -} - - -INSTANTIATE_CLASS(SplitLayer); - -} // namespace caffe diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp index 6b5166d53e9..8dc6ba7b06d 100644 --- a/src/caffe/layers/tanh_layer.cpp +++ b/src/caffe/layers/tanh_layer.cpp @@ -13,8 +13,8 @@ namespace caffe { template Dtype TanHLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); Dtype exp2x; const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { @@ -29,9 +29,9 @@ void TanHLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* top_data = top[0]->const_data(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); const int count = (*bottom)[0]->count(); Dtype tanhx; for (int i = 0; i < count; ++i) { diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp index e6ed8a6b40e..47d58589064 100644 --- a/src/caffe/layers/threshold_layer.cpp +++ b/src/caffe/layers/threshold_layer.cpp @@ -18,8 +18,8 @@ void ThresholdLayer::SetUp(const vector*>& bottom, template Dtype ThresholdLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { top_data[i] = (bottom_data[i] > threshold_) ? 
Dtype(1) : Dtype(0); diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index fd4860f98be..a068ff008b4 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -40,8 +40,8 @@ void* WindowDataLayerPrefetch(void* layer_pointer) { // At each iteration, sample N windows where N*p are foreground (object) // windows and N*(1-p) are background (non-object) windows - Dtype* top_data = layer->prefetch_data_->mutable_cpu_data(); - Dtype* top_label = layer->prefetch_label_->mutable_cpu_data(); + Dtype* top_data = layer->prefetch_data_->mutable_data(); + Dtype* top_label = layer->prefetch_label_->mutable_data(); const Dtype scale = layer->layer_param_.window_data_param().scale(); const int batch_size = layer->layer_param_.window_data_param().batch_size(); const int crop_size = layer->layer_param_.window_data_param().crop_size(); @@ -49,7 +49,7 @@ void* WindowDataLayerPrefetch(void* layer_pointer) { const bool mirror = layer->layer_param_.window_data_param().mirror(); const float fg_fraction = layer->layer_param_.window_data_param().fg_fraction(); - const Dtype* mean = layer->data_mean_.cpu_data(); + const Dtype* mean = layer->data_mean_.const_data(); const int mean_off = (layer->data_mean_.width() - crop_size) / 2; const int mean_width = layer->data_mean_.width(); const int mean_height = layer->data_mean_.height(); @@ -398,12 +398,12 @@ void WindowDataLayer::SetUp(const vector*>& bottom, data_mean_.Reshape(1, channels, crop_size, crop_size); } // Now, start the prefetch thread. Before calling prefetch, we make two - // cpu_data calls so that the prefetch thread does not accidentally make + // const_data calls so that the prefetch thread does not accidentally make // simultaneous cudaMalloc calls when the main thread is running. In some // GPUs this seems to cause failures if we do not so. - prefetch_data_->mutable_cpu_data(); - prefetch_label_->mutable_cpu_data(); - data_mean_.cpu_data(); + prefetch_data_->mutable_data(); + prefetch_label_->mutable_data(); + data_mean_.const_data(); DLOG(INFO) << "Initializing prefetch"; CreatePrefetchThread(); DLOG(INFO) << "Prefetch initialized."; @@ -439,15 +439,17 @@ unsigned int WindowDataLayer::PrefetchRand() { } template -Dtype WindowDataLayer::Forward_cpu(const vector*>& bottom, +Dtype WindowDataLayer::Forward(const vector*>& bottom, vector*>* top) { // First, join the thread JoinPrefetchThread(); // Copy the data - caffe_copy(prefetch_data_->count(), prefetch_data_->cpu_data(), - (*top)[0]->mutable_cpu_data()); - caffe_copy(prefetch_label_->count(), prefetch_label_->cpu_data(), - (*top)[1]->mutable_cpu_data()); + DeviceFactory::GetDevice()->copy_from_cpu( + prefetch_data_->count(), prefetch_data_->const_data(), + (*top)[0]->mutable_data()); + DeviceFactory::GetDevice()->copy_from_cpu( + prefetch_label_->count(), prefetch_label_->const_data(), + (*top)[1]->mutable_data()); // Start a new prefetch thread CreatePrefetchThread(); return Dtype(0.); diff --git a/src/caffe/layers/window_data_layer.cu b/src/caffe/layers/window_data_layer.cu deleted file mode 100644 index bc49fef6545..00000000000 --- a/src/caffe/layers/window_data_layer.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2014 BVLC and contributors. -// -// Based on data_layer.cpp by Yangqing Jia. 
- -#include -#include - -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -using std::string; -using std::map; -using std::pair; - -// caffe.proto > LayerParameter > WindowDataParameter -// 'source' field specifies the window_file -// 'crop_size' indicates the desired warped size - -namespace caffe { - -template -Dtype WindowDataLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - // First, join the thread - JoinPrefetchThread(); - // Copy the data - CUDA_CHECK(cudaMemcpy((*top)[0]->mutable_gpu_data(), - prefetch_data_->cpu_data(), sizeof(Dtype) * prefetch_data_->count(), - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy((*top)[1]->mutable_gpu_data(), - prefetch_label_->cpu_data(), sizeof(Dtype) * prefetch_label_->count(), - cudaMemcpyHostToDevice)); - // Start a new prefetch thread - CreatePrefetchThread(); - return Dtype(0.); -} - -INSTANTIATE_CLASS(WindowDataLayer); - -} // namespace caffe diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index f364e6767c6..30c823f843a 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -482,20 +482,10 @@ void Net::Update() { const int count = params_[i]->count(); const Dtype* this_diff; Dtype* owner_diff; - switch (Caffe::mode()) { - case Caffe::CPU: - this_diff = params_[i]->cpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); - caffe_add(count, this_diff, owner_diff, owner_diff); - break; - case Caffe::GPU: - this_diff = params_[i]->gpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_gpu_diff(); - caffe_gpu_add(count, this_diff, owner_diff, owner_diff); - break; - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } + this_diff = params_[i]->const_diff(); + owner_diff = params_[param_owners_[i]]->mutable_diff(); + DeviceFactory::GetDevice()->add(count, this_diff, owner_diff, + owner_diff); } // Now, update the owned parameters. for (int i = 0; i < params_.size(); ++i) { diff --git a/src/caffe/opencl_syncedmem.cpp b/src/caffe/opencl_syncedmem.cpp new file mode 100644 index 00000000000..f14d4e8177d --- /dev/null +++ b/src/caffe/opencl_syncedmem.cpp @@ -0,0 +1,133 @@ +// Copyright 2014 BVLC and contributors. 
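The OpenCLSyncedMemory implementation that follows backs each allocation with a single aligned host pointer, wraps it in a cl_mem created with CL_MEM_USE_HOST_PTR, and maps/unmaps that buffer to hand out device-visible pointers. Below is a minimal, self-contained sketch of that zero-copy pattern, not the PR's code: the 4096-byte alignment, the stripped error handling, and the demo structure are assumptions made only for illustration.

// Zero-copy sketch: one aligned host allocation aliased by a CL buffer
// created with CL_MEM_USE_HOST_PTR; host access goes through map/unmap.
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main() {
  const size_t size = 1024 * sizeof(float);
  // Aligned host allocation (the patch uses its own opencl_aligned_malloc
  // helper; the alignment value here is an assumption).
  void* host_ptr = NULL;
  if (posix_memalign(&host_ptr, 4096, size) != 0) return 1;
  memset(host_ptr, 0, size);

  cl_platform_id platform;
  cl_device_id device;
  clGetPlatformIDs(1, &platform, NULL);
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL);
  cl_int err;
  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err);

  // The device buffer aliases the host allocation instead of copying it.
  cl_mem buf = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size,
                              host_ptr, &err);

  // Mapping gives the host a pointer it may read and write; unmapping hands
  // the region back to the device side. This is how the synced memory class
  // keeps its CPU and device views coherent.
  void* mapped = clEnqueueMapBuffer(queue, buf, CL_TRUE,
                                    CL_MAP_READ | CL_MAP_WRITE,
                                    0, size, 0, NULL, NULL, &err);
  static_cast<float*>(mapped)[0] = 42.0f;
  clEnqueueUnmapMemObject(queue, buf, mapped, 0, NULL, NULL);
  clFinish(queue);

  printf("first element seen from the host: %f\n",
         static_cast<float*>(host_ptr)[0]);
  clReleaseMemObject(buf);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
  free(host_ptr);
  return 0;
}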
+ +#ifdef USE_OPENCL +#include + +#include "caffe/common.hpp" +#include "caffe/opencl_syncedmem.hpp" + +namespace caffe { + +OpenCLSyncedMemory::~OpenCLSyncedMemory() { + if (shared_host_ptr_ && this->own_cpu_data_) { + opencl_aligned_free(shared_host_ptr_); + shared_host_ptr_ = NULL; + } + + if (mapped_device_ptr_) { + CL_CHECK(clReleaseMemObject(device_mem_)); + free(mapped_device_ptr_); + mapped_device_ptr_ = NULL; + } +} + +inline void OpenCLSyncedMemory::to_cpu() { + switch (this->head_) { + case UNINITIALIZED: + opencl_aligned_malloc(&shared_host_ptr_, &(this->size_)); + memset(shared_host_ptr_, 0, this->size_); + this->head_ = HEAD_AT_CPU; + this->own_cpu_data_ = true; + break; + case HEAD_AT_GPU: + if (shared_host_ptr_ == NULL) { + opencl_aligned_malloc(&shared_host_ptr_, &(this->size_)); + this->own_cpu_data_ = true; + } + CL_CHECK(clEnqueueReadBuffer( + CaffeOpenCL::queue(), device_mem_, CL_TRUE, 0, + this->size_, shared_host_ptr_, 0, NULL, NULL)); + this->head_ = SYNCED; + break; + case HEAD_AT_CPU: + case SYNCED: + break; + } +} + +inline void OpenCLSyncedMemory::to_gpu() { + switch (this->head_) { + case UNINITIALIZED: +/* + * http://streamcomputing.eu/blog/2013-02-03/opencl-basics-flags-for-the-creating-memory-objects/ + */ + opencl_aligned_malloc(&shared_host_ptr_, &(this->size_)); + cl_int error; + device_mem_ = clCreateBuffer( + CaffeOpenCL::context(), CL_MEM_USE_HOST_PTR, + this->size_, shared_host_ptr_, &error); + CL_CHECK(error); + this->head_ = HEAD_AT_GPU; + break; + case HEAD_AT_CPU: + if (mapped_device_ptr_ == NULL) { + cl_int error; + device_mem_ = clCreateBuffer( + CaffeOpenCL::context(), CL_MEM_USE_HOST_PTR, + this->size_, shared_host_ptr_, &error); + CL_CHECK(error); + mapped_device_ptr_ = clEnqueueMapBuffer( + CaffeOpenCL::queue(), device_mem_, CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, 0, this->size_, 0, NULL, NULL, &error); + CL_CHECK(error); + } + CL_CHECK(clEnqueueWriteBuffer( + CaffeOpenCL::queue(), device_mem_, CL_TRUE, 0, + this->size_, shared_host_ptr_, 0, NULL, NULL)); + this->head_ = SYNCED; + break; + case HEAD_AT_GPU: + case SYNCED: + break; + } +} + +const void* OpenCLSyncedMemory::cpu_data() { + to_cpu(); + return (const void*)shared_host_ptr_; +} + +void OpenCLSyncedMemory::set_cpu_data(void* data) { + CHECK(data); + if (this->own_cpu_data_) { + CaffeFreeHost(shared_host_ptr_); + } + shared_host_ptr_ = data; + this->head_ = HEAD_AT_CPU; + this->own_cpu_data_ = false; +} + +const void* OpenCLSyncedMemory::gpu_data() { + to_gpu(); + cl_int error; + mapped_device_ptr_ = clEnqueueMapBuffer( + CaffeOpenCL::queue(), device_mem_, CL_TRUE, + CL_MAP_WRITE, 0, this->size_, 0, NULL, NULL, &error); + CL_CHECK(error); + CL_CHECK(clEnqueueUnmapMemObject( + CaffeOpenCL::queue(), device_mem_, mapped_device_ptr_, + 0, NULL, NULL)); + return (const void*)(mapped_device_ptr_); +} + +void* OpenCLSyncedMemory::mutable_cpu_data() { + to_cpu(); + return shared_host_ptr_; +} + +void* OpenCLSyncedMemory::mutable_gpu_data() { + to_gpu(); + cl_int error; + mapped_device_ptr_ = clEnqueueMapBuffer( + CaffeOpenCL::queue(), device_mem_, CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, 0, this->size_, 0, NULL, NULL, &error); + CL_CHECK(error); + CL_CHECK(clEnqueueUnmapMemObject( + CaffeOpenCL::queue(), device_mem_, mapped_device_ptr_, + 0, NULL, NULL)); + return mapped_device_ptr_; +} + + +} // namespace caffe +#endif // USE_OPENCL diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 769618175ac..3642bf9b40e 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -153,7 
+153,7 @@ void Solver::Test(const int test_net_id) { } if (i == 0) { for (int j = 0; j < result.size(); ++j) { - const Dtype* result_vec = result[j]->cpu_data(); + const Dtype* result_vec = result[j]->const_data(); for (int k = 0; k < result[j]->count(); ++k) { test_score.push_back(result_vec[k]); } @@ -161,7 +161,7 @@ void Solver::Test(const int test_net_id) { } else { int idx = 0; for (int j = 0; j < result.size(); ++j) { - const Dtype* result_vec = result[j]->cpu_data(); + const Dtype* result_vec = result[j]->const_data(); for (int k = 0; k < result[j]->count(); ++k) { test_score[idx++] += result_vec[k]; } @@ -272,51 +272,25 @@ void SGDSolver::ComputeUpdateValue() { } Dtype momentum = this->param_.momentum(); Dtype weight_decay = this->param_.weight_decay(); - switch (Caffe::mode()) { - case Caffe::CPU: - for (int param_id = 0; param_id < net_params.size(); ++param_id) { - // Compute the value to history, and then copy them to the blob's diff. - Dtype local_rate = rate * net_params_lr[param_id]; - Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - history_[param_id]->mutable_cpu_data()); - if (local_decay) { - // add weight decay - caffe_axpy(net_params[param_id]->count(), - local_decay * local_rate, - net_params[param_id]->cpu_data(), - history_[param_id]->mutable_cpu_data()); - } - // copy - caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - } - break; - case Caffe::GPU: - for (int param_id = 0; param_id < net_params.size(); ++param_id) { - // Compute the value to history, and then copy them to the blob's diff. - Dtype local_rate = rate * net_params_lr[param_id]; - Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; - caffe_gpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->gpu_diff(), momentum, - history_[param_id]->mutable_gpu_data()); - if (local_decay) { - // add weight decay - caffe_gpu_axpy(net_params[param_id]->count(), - local_decay * local_rate, - net_params[param_id]->gpu_data(), - history_[param_id]->mutable_gpu_data()); - } - // copy - caffe_gpu_copy(net_params[param_id]->count(), - history_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); + for (int param_id = 0; param_id < net_params.size(); ++param_id) { + // Compute the value to history, and then copy them to the blob's diff. 
+ Dtype local_rate = rate * net_params_lr[param_id]; + Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; + DeviceFactory::GetDevice()->axpby( + net_params[param_id]->count(), local_rate, + net_params[param_id]->const_diff(), momentum, + history_[param_id]->mutable_data()); + if (local_decay) { + // add weight decay + DeviceFactory::GetDevice()->axpy(net_params[param_id]->count(), + local_decay * local_rate, + net_params[param_id]->const_data(), + history_[param_id]->mutable_data()); } - break; - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + // copy + DeviceFactory::GetDevice()->copy(net_params[param_id]->count(), + history_[param_id]->const_data(), + net_params[param_id]->mutable_diff()); } } diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index fec37d6e9ec..76c99bcf9bc 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -93,6 +93,29 @@ void* SyncedMemory::mutable_gpu_data() { return gpu_ptr_; } +const void* SyncedMemory::const_data() { + switch (Caffe::mode()) { + case Caffe::CPU: + return cpu_data(); + case Caffe::GPU: + return gpu_data(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} + +void* SyncedMemory::mutable_data() { + switch (Caffe::mode()) { + case Caffe::CPU: + return mutable_cpu_data(); + case Caffe::GPU: + return mutable_gpu_data(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} } // namespace caffe diff --git a/src/caffe/syncedmem_factory.cpp b/src/caffe/syncedmem_factory.cpp new file mode 100644 index 00000000000..5e86403e929 --- /dev/null +++ b/src/caffe/syncedmem_factory.cpp @@ -0,0 +1,24 @@ +// Copyright 2014 BVLC and contributors. + +#include "caffe/syncedmem_factory.hpp" + +namespace caffe { + +AbstractSyncedMemory* GetSyncedMemory(const size_t size) { + switch (Caffe::mode()) { + case Caffe::CPU: + case Caffe::GPU: + return new SyncedMemory(size); +#ifdef USE_OPENCL + case Caffe::OPENCL_CPU: + case Caffe::OPENCL_GPU: + return new OpenCLSyncedMemory(size); +#endif + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} + +} // namespace caffe + diff --git a/src/caffe/test/test_concat_layer.cpp b/src/caffe/test/test_concat_layer.cpp index 72e3c902cf1..66c50723379 100644 --- a/src/caffe/test/test_concat_layer.cpp +++ b/src/caffe/test/test_concat_layer.cpp @@ -84,8 +84,8 @@ TYPED_TEST(ConcatLayerTest, TestSetupChannels) { TYPED_TEST(ConcatLayerTest, TestCPUNum) { LayerParameter layer_param; - ConcatLayer layer(layer_param); Caffe::set_mode(Caffe::CPU); + ConcatLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_0, &(this->blob_top_vec_)); layer.Forward(this->blob_bottom_vec_0, &(this->blob_top_vec_)); for (int n = 0; n < this->blob_top_->num(); ++n) { @@ -93,7 +93,8 @@ TYPED_TEST(ConcatLayerTest, TestCPUNum) { for (int h = 0; h < this->blob_top_->height(); ++h) { for (int w = 0; w < this->blob_top_->width(); ++w) { EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_vec_0[0]->data_at(n, c, h, w)); + this->blob_bottom_vec_0[0]->data_at(n, c, h, w)) << + "n " << n << ", c " << c << ", h " << h << ", w " << w; } } } @@ -101,7 +102,8 @@ TYPED_TEST(ConcatLayerTest, TestCPUNum) { for (int h = 0; h < this->blob_top_->height(); ++h) { for (int w = 0; w < this->blob_top_->width(); ++w) { EXPECT_EQ(this->blob_top_->data_at(n, c+3, h, w), - this->blob_bottom_vec_0[1]->data_at(n, c, h, w)); + this->blob_bottom_vec_0[1]->data_at(n, c, h, w)) << + "n " << n << ", c " << c << ", h " << 
h << ", w " << w; } } } diff --git a/src/caffe/test/test_opencl_math_functions.cpp b/src/caffe/test/test_opencl_math_functions.cpp new file mode 100644 index 00000000000..4ff25d82711 --- /dev/null +++ b/src/caffe/test/test_opencl_math_functions.cpp @@ -0,0 +1,268 @@ +// Copyright 2014 BVLC and contributors. + +#include // for uint32_t & uint64_t +#include +#include +#include // for std::fabs +#include // for rand_r + +#include "gtest/gtest.h" +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/util/opencl_math_functions.hpp" + +#include "caffe/test/test_caffe_main.hpp" + +namespace caffe { + +template +class OpenCLMathFunctionsTest : public ::testing::Test { + protected: + OpenCLMathFunctionsTest() + : blob_bottom_(new Blob()), + blob_bottom2_(new Blob()), + blob_top_(new Blob()) { + } + + virtual void SetUp() { + Caffe::set_random_seed(1701); + this->blob_bottom_->Reshape(11, 17, 19, 23); + this->blob_bottom2_->Reshape(11, 17, 19, 23); + this->blob_top_->Reshape(11, 17, 19, 23); + // fill the values + FillerParameter filler_param; + GaussianFiller filler(filler_param); + filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom2_); + filler.Fill(this->blob_top_); + } + + virtual ~OpenCLMathFunctionsTest() { + delete blob_bottom_; + delete blob_bottom2_; + delete blob_top_; + } + + Blob* const blob_bottom_; + Blob* const blob_bottom2_; + Blob* const blob_top_; +}; + +typedef ::testing::Types Dtypes; +TYPED_TEST_CASE(OpenCLMathFunctionsTest, Dtypes); + +TYPED_TEST(OpenCLMathFunctionsTest, TestNothing) { + // The first test case of a test suite takes the longest time + // due to the set up overhead. +} + +// TODO: Fix caffe_opencl_hamming_distance and re-enable this test. +TYPED_TEST(OpenCLMathFunctionsTest, DISABLED_TestHammingDistanceOpenCL) { + int n = this->blob_bottom_->count(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + const TypeParam* y = this->blob_top_->cpu_data(); + int reference_distance = this->ReferenceHammingDistance(n, x, y); + x = this->blob_bottom_->opencl_data(); + y = this->blob_top_->opencl_data(); + int computed_distance = caffe_opencl_hamming_distance(n, x, y); + EXPECT_EQ(reference_distance, computed_distance); +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestAsumOpenCL) { + int n = this->blob_bottom_->count(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + TypeParam std_asum = 0; + for (int i = 0; i < n; ++i) { + std_asum += std::fabs(x[i]); + } + TypeParam opencl_asum; + caffe_opencl_asum(n, this->blob_bottom_->opencl_data(), &opencl_asum); + EXPECT_LT((opencl_asum - std_asum) / std_asum, 1e-2); +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSignOpenCL) { + int n = this->blob_bottom_->count(); + caffe_opencl_sign(n, this->blob_bottom_->opencl_data(), + this->blob_bottom_->mutable_opencl_diff()); + const TypeParam* signs = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(signs[i], x[i] > 0 ? 1 : (x[i] < 0 ? -1 : 0)); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSgnbitOpenCL) { + int n = this->blob_bottom_->count(); + caffe_opencl_sgnbit(n, this->blob_bottom_->opencl_data(), + this->blob_bottom_->mutable_opencl_diff()); + const TypeParam* signbits = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(signbits[i], x[i] < 0 ? 
1 : 0); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestFabsOpenCL) { + int n = this->blob_bottom_->count(); + caffe_opencl_fabs(n, this->blob_bottom_->opencl_data(), + this->blob_bottom_->mutable_opencl_diff()); + const TypeParam* abs_val = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(abs_val[i], x[i] > 0 ? x[i] : -x[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestScaleOpenCL) { + int n = this->blob_bottom_->count(); + TypeParam alpha = this->blob_bottom_->cpu_diff()[caffe_rng_rand() % + this->blob_bottom_->count()]; + caffe_opencl_scale(n, alpha, this->blob_bottom_->opencl_data(), + this->blob_bottom_->mutable_opencl_diff()); + const TypeParam* scaled = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(scaled[i], x[i] * alpha); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestCopyFromCPU) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->cpu_data(); + TypeParam* top_data = this->blob_top_->mutable_cpu_data(); + caffe_opencl_copy_from_cpu(n, bottom_data, top_data); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestCopyOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_copy(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSqrOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_sqr(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestExpOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_exp(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSignOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_sign(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSgnbitOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_sgnbit(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + 
EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestFabsOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_fabs(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestAddOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + const TypeParam* bottom2_data = this->blob_bottom2_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_add(n, bottom_data, bottom2_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + bottom2_data = this->blob_bottom2_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i] + bottom2_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSubOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + const TypeParam* bottom2_data = this->blob_bottom2_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_sub(n, bottom_data, bottom2_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + bottom2_data = this->blob_bottom2_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i] - bottom2_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestMulOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + const TypeParam* bottom2_data = this->blob_bottom2_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_mul(n, bottom_data, bottom2_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + bottom2_data = this->blob_bottom2_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i] * bottom2_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestDivOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + const TypeParam* bottom2_data = this->blob_bottom2_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_div(n, bottom_data, bottom2_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + bottom2_data = this->blob_bottom2_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i] / std::min(bottom2_data[i], 1e-5), top_data[i]); + } +} + +} // namespace caffe diff --git a/src/caffe/test/test_opencl_syncedmem.cpp b/src/caffe/test/test_opencl_syncedmem.cpp new file mode 100644 index 00000000000..0724298356b --- /dev/null +++ b/src/caffe/test/test_opencl_syncedmem.cpp @@ -0,0 +1,90 @@ +// Copyright 2014 BVLC and contributors. 
+ +#include +#include + +#include "cuda_runtime.h" +#include "gtest/gtest.h" +#include "caffe/common.hpp" +#include "caffe/opencl_syncedmem.hpp" + +#include "caffe/test/test_caffe_main.hpp" + +namespace caffe { + +class OpenCLSyncedMemoryTest : public ::testing::Test {}; + +TEST_F(OpenCLSyncedMemoryTest, TestInitialization) { + OpenCLSyncedMemory mem(10); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::UNINITIALIZED); + EXPECT_EQ(mem.size(), 10); + OpenCLSyncedMemory* p_mem = new OpenCLSyncedMemory(10 * sizeof(float)); + EXPECT_EQ(p_mem->size(), 10 * sizeof(float)); + delete p_mem; +} + +TEST_F(OpenCLSyncedMemoryTest, TestAllocation) { + OpenCLSyncedMemory mem(10); + EXPECT_TRUE(mem.cpu_data()); + EXPECT_TRUE(mem.gpu_data()); + EXPECT_TRUE(mem.mutable_cpu_data()); + EXPECT_TRUE(mem.mutable_gpu_data()); +} + +TEST_F(OpenCLSyncedMemoryTest, TestCPUWrite) { + OpenCLSyncedMemory mem(10); + void* cpu_data = mem.mutable_cpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::HEAD_AT_CPU); + memset(cpu_data, 1, mem.size()); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(cpu_data))[i], 1); + } + const void* gpu_data = mem.gpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::SYNCED); + // check if values are the same + char* recovered_value = new char[10]; + cudaMemcpy(reinterpret_cast(recovered_value), gpu_data, 10, + cudaMemcpyDeviceToHost); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(recovered_value))[i], 1); + } + // do another round + cpu_data = mem.mutable_cpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::HEAD_AT_CPU); + memset(cpu_data, 2, mem.size()); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(cpu_data))[i], 2); + } + gpu_data = mem.gpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::SYNCED); + // check if values are the same + cudaMemcpy(reinterpret_cast(recovered_value), gpu_data, 10, + cudaMemcpyDeviceToHost); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(recovered_value))[i], 2); + } + delete[] recovered_value; +} + +TEST_F(OpenCLSyncedMemoryTest, TestGPUWrite) { + OpenCLSyncedMemory mem(10); + void* gpu_data = mem.mutable_gpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::HEAD_AT_GPU); + CUDA_CHECK(cudaMemset(gpu_data, 1, mem.size())); + const void* cpu_data = mem.cpu_data(); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(cpu_data))[i], 1); + } + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::SYNCED); + + gpu_data = mem.mutable_gpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::HEAD_AT_GPU); + CUDA_CHECK(cudaMemset(gpu_data, 2, mem.size())); + cpu_data = mem.cpu_data(); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(cpu_data))[i], 2); + } + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::SYNCED); +} + +} // namespace caffe diff --git a/src/caffe/util/cpu_device.cpp b/src/caffe/util/cpu_device.cpp new file mode 100644 index 00000000000..72e3af12025 --- /dev/null +++ b/src/caffe/util/cpu_device.cpp @@ -0,0 +1,175 @@ +// Copyright 2014 BVLC and contributors. 
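The CPU device backend that follows (src/caffe/util/cpu_device.cpp) supplies the concrete implementation behind the DeviceFactory::GetDevice()->gemm/axpby/copy/... calls that the layer, net, and solver hunks above switched to. The device.hpp interface itself is not shown in this part of the patch, so the sketch below is only an illustration of what such an interface and factory could look like; apart from Device, CPUDevice, DeviceFactory::GetDevice and the math-call names visible in the diff, every name here is an assumption.

// Minimal sketch of a backend-neutral math interface plus a factory that
// dispatches on the run-time mode. Plain loops stand in for the
// caffe_cpu_* / BLAS calls that the real cpu_device.cpp delegates to.
#include <cstring>

namespace device_sketch {

enum Brew { CPU, GPU, OPENCL_CPU, OPENCL_GPU, OPENCL_ALL };
inline Brew& mode() { static Brew m = CPU; return m; }  // stands in for Caffe::mode()

template <typename Dtype>
class Device {
 public:
  virtual ~Device() {}
  // y = alpha * x + beta * y
  virtual void axpby(const int n, const Dtype alpha, const Dtype* x,
                     const Dtype beta, Dtype* y) = 0;
  virtual void copy(const int n, const Dtype* x, Dtype* y) = 0;
  virtual void scal(const int n, const Dtype alpha, Dtype* x) = 0;
};

template <typename Dtype>
class CPUDevice : public Device<Dtype> {
 public:
  void axpby(const int n, const Dtype alpha, const Dtype* x,
             const Dtype beta, Dtype* y) {
    for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + beta * y[i];
  }
  void copy(const int n, const Dtype* x, Dtype* y) {
    std::memcpy(y, x, sizeof(Dtype) * n);
  }
  void scal(const int n, const Dtype alpha, Dtype* x) {
    for (int i = 0; i < n; ++i) x[i] *= alpha;
  }
};

// One switch on the run-time mode replaces the per-layer
// switch (Caffe::mode()) { case CPU: ... case GPU: ... } blocks removed above.
template <typename Dtype>
class DeviceFactory {
 public:
  static Device<Dtype>* GetDevice() {
    static CPUDevice<Dtype> cpu_device;
    switch (mode()) {
      case CPU:
      default:  // GPU / OPENCL_* backends would be returned here instead.
        return &cpu_device;
    }
  }
};

}  // namespace device_sketch

With an interface of this shape, a layer body such as PowerLayer::Backward above is written once and runs on whichever backend the factory returns.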
+
+#include "caffe/common.hpp"
+#include "caffe/util/device.hpp"
+
+namespace caffe {
+template <typename Dtype>
+void CPUDevice<Dtype>::gemm(const CBLAS_TRANSPOSE TransA,
+                            const CBLAS_TRANSPOSE TransB, const int M,
+                            const int N, const int K, const Dtype alpha,
+                            const Dtype* A, const Dtype* B,
+                            const Dtype beta, Dtype* C) {
+  caffe_cpu_gemm<Dtype>(TransA, TransB, M, N, K, alpha, A, B, beta, C);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::gemv(const CBLAS_TRANSPOSE TransA, const int M,
+                            const int N, const Dtype alpha, const Dtype* A,
+                            const Dtype* x, const Dtype beta, Dtype* y) {
+  caffe_cpu_gemv<Dtype>(TransA, M, N, alpha, A, x, beta, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::axpy(const int N, const Dtype alpha, const Dtype* X,
+                            Dtype* Y) {
+  caffe_axpy<Dtype>(N, alpha, X, Y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::axpby(const int N, const Dtype alpha,
+                             const Dtype* X, const Dtype beta, Dtype* Y) {
+  caffe_cpu_axpby<Dtype>(N, alpha, X, beta, Y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::copy(const int N, const Dtype *X, Dtype *Y) {
+  caffe_copy<Dtype>(N, X, Y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::copy_from_cpu(const int N, const Dtype *X, Dtype *Y) {
+  caffe_copy<Dtype>(N, X, Y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::set(const int N, const Dtype alpha, Dtype *X) {
+  caffe_set<Dtype>(N, alpha, X);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::add_scalar(const int N, const Dtype alpha,
+                                  Dtype *X) {
+  caffe_add_scalar<Dtype>(N, alpha, X);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::scal(const int N, const Dtype alpha, Dtype *X) {
+  caffe_scal<Dtype>(N, alpha, X);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::sqr(const int N, const Dtype* a, Dtype* y) {
+  caffe_sqr<Dtype>(N, a, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::add(const int N, const Dtype* a, const Dtype* b,
+                           Dtype* y) {
+  caffe_add<Dtype>(N, a, b, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::sub(const int N, const Dtype* a, const Dtype* b,
+                           Dtype* y) {
+  caffe_sub<Dtype>(N, a, b, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::mul(const int N, const Dtype* a, const Dtype* b,
+                           Dtype* y) {
+  caffe_mul<Dtype>(N, a, b, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::div(const int N, const Dtype* a, const Dtype* b,
+                           Dtype* y) {
+  caffe_div<Dtype>(N, a, b, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::powx(const int N, const Dtype* a, const Dtype b,
+                            Dtype* y) {
+  caffe_powx<Dtype>(N, a, b, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::rng_uniform(const int N, const Dtype a,
+                                   const Dtype b, Dtype* r) {
+  caffe_rng_uniform<Dtype>(N, a, b, r);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::rng_gaussian(const int N, const Dtype mu,
+                                    const Dtype sigma, Dtype* r) {
+  caffe_rng_gaussian<Dtype>(N, mu, sigma, r);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::rng_bernoulli(const int N, const Dtype p, int* r) {
+  caffe_rng_bernoulli<Dtype>(N, p, r);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::exp(const int N, const Dtype* a, Dtype* y) {
+  caffe_exp<Dtype>(N, a, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::dot(const int N, const Dtype* x, const Dtype* y,
+                           Dtype* out) {
+  *out = caffe_cpu_dot<Dtype>(N, x, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::hamming_distance(const int N, const Dtype* x,
+                                        const Dtype* y, uint32_t* out) {
+  *out = caffe_cpu_hamming_distance<Dtype>(N, x, y);
+}
+
+template <typename Dtype>
+// Returns the sum of the absolute values of the elements of vector x
+void CPUDevice<Dtype>::asum(const int N, const Dtype* x, Dtype* y) {
+  *y = caffe_cpu_asum<Dtype>(N, x);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::sign(const int N, const Dtype* x, Dtype* y) {
+  caffe_cpu_sign<Dtype>(N, x, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::sgnbit(const int N, const Dtype* x, Dtype* y) {
+  caffe_cpu_sgnbit<Dtype>(N, x, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::fabs(const int N, const Dtype* x, Dtype* y) {
+  caffe_cpu_fabs<Dtype>(N, x, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::scale(const int N, const Dtype alpha,
+                             const Dtype *x, Dtype* y) {
+  caffe_cpu_scale<Dtype>(N, alpha, x, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::im2col(const Dtype* data_im, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_col) {
+  im2col_cpu<Dtype>(data_im, channels, height, width, ksize, pad, stride,
+                    data_col);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::col2im(const Dtype* data_col, const int channels,
+    const int height, const int width, const int psize, const int pad,
+    const int stride, Dtype* data_im) {
+  col2im_cpu<Dtype>(data_col, channels, height, width, psize, pad, stride,
+                    data_im);
+}
+
+INSTANTIATE_CLASS(CPUDevice);
+
+} // namespace caffe
diff --git a/src/caffe/util/device.cpp b/src/caffe/util/device.cpp
new file mode 100644
index 00000000000..bb25372122f
--- /dev/null
+++ b/src/caffe/util/device.cpp
@@ -0,0 +1,42 @@
+// Copyright 2014 BVLC and contributors.
+
+#include "caffe/common.hpp"
+#include "caffe/util/device.hpp"
+#ifdef USE_OPENCL
+#include "caffe/util/opencl_device.hpp"
+#endif
+
+namespace caffe {
+
+template <typename Dtype>
+Device<Dtype>* DeviceFactory<Dtype>::GetDevice() {
+  switch (Caffe::mode()) {
+  case Caffe::CPU:
+    return cpu_device_;
+  case Caffe::GPU:
+    return gpu_device_;
+#ifdef USE_OPENCL
+  case Caffe::OPENCL_CPU:
+  case Caffe::OPENCL_GPU:
+    return opencl_device_;
+#endif
+  default:
+    LOG(FATAL) << "Unknown caffe mode.";
+    return static_cast<Device<Dtype>*>(NULL);
+  }
+}
+
+template <typename Dtype>
+Device<Dtype>* DeviceFactory<Dtype>::cpu_device_ = new CPUDevice<Dtype>();
+
+template <typename Dtype>
+Device<Dtype>* DeviceFactory<Dtype>::gpu_device_ = new GPUDevice<Dtype>();
+
+#ifdef USE_OPENCL
+template <typename Dtype>
+Device<Dtype>* DeviceFactory<Dtype>::opencl_device_ = new OpenCLDevice<Dtype>();
+#endif
+
+INSTANTIATE_CLASS(DeviceFactory);
+
+} // namespace caffe
diff --git a/src/caffe/util/gpu_device.cpp b/src/caffe/util/gpu_device.cpp
new file mode 100644
index 00000000000..d5083801b55
--- /dev/null
+++ b/src/caffe/util/gpu_device.cpp
@@ -0,0 +1,178 @@
+// Copyright 2014 BVLC and contributors.
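The DeviceFactory above is what lets layer code collapse per-mode math calls into a single mode-agnostic code path. A minimal sketch of the resulting call pattern follows; the free function and its arguments are illustrative only, while the Device<Dtype>::gemm signature matches cpu_device.cpp above:

// Illustrative only: route a GEMM through whichever backend Caffe::mode() selects.
template <typename Dtype>
void device_agnostic_gemm_example(const int M, const int N, const int K,
                                  const Dtype* A, const Dtype* B, Dtype* C) {
  // DeviceFactory<Dtype>::GetDevice() returns the CPU, CUDA, or OpenCL backend
  // based on Caffe::mode(), so no per-mode branch is needed at the call site.
  DeviceFactory<Dtype>::GetDevice()->gemm(
      CblasNoTrans, CblasNoTrans, M, N, K, Dtype(1), A, B, Dtype(0), C);
}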
+ +#include "caffe/common.hpp" +#include "caffe/util/device.hpp" + +namespace caffe { +template +void GPUDevice::gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, + const Dtype beta, Dtype* C) { + caffe_gpu_gemm(TransA, TransB, M, N, K, alpha, A, B, beta, C); +} + +template +void GPUDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const Dtype alpha, const Dtype* A, + const Dtype* x, const Dtype beta, Dtype* y) { + caffe_gpu_gemv(TransA, M, N, alpha, A, x, beta, y); +} + +template +void GPUDevice::axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y) { + caffe_gpu_axpy(N, alpha, X, Y); +} + +template +void GPUDevice::axpby(const int N, const Dtype alpha, + const Dtype* X, const Dtype beta, Dtype* Y) { + caffe_gpu_axpby(N, alpha, X, beta, Y); +} + +template +void GPUDevice::copy(const int N, const Dtype *X, Dtype *Y) { + caffe_gpu_copy(N, X, Y); +} + +template +void GPUDevice::copy_from_cpu(const int N, const Dtype *X, Dtype *Y) { + CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyHostToDevice)); +} + +template +void GPUDevice::set(const int N, const Dtype alpha, Dtype *X) { + caffe_gpu_set(N, alpha, X); +} + +template +void GPUDevice::add_scalar(const int N, const Dtype alpha, + Dtype *X) { + caffe_gpu_add_scalar(N, alpha, X); +} + +template +void GPUDevice::scal(const int N, const Dtype alpha, Dtype *X) { + caffe_gpu_scal(N, alpha, X); +} + +template +void GPUDevice::sqr(const int N, const Dtype* a, Dtype* y) { + NOT_IMPLEMENTED; +// caffe_gpu_sqr(N, a, y); +} + +template +void GPUDevice::add(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_gpu_add(N, a, b, y); +} + +template +void GPUDevice::sub(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_gpu_sub(N, a, b, y); +} + +template +void GPUDevice::mul(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_gpu_mul(N, a, b, y); +} + +template +void GPUDevice::div(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_gpu_div(N, a, b, y); +} + +template +void GPUDevice::powx(const int N, const Dtype* a, const Dtype b, + Dtype* y) { + caffe_gpu_powx(N, a, b, y); +} + +template +void GPUDevice::rng_uniform(const int N, const Dtype a, + const Dtype b, Dtype* r) { + caffe_gpu_rng_uniform(N, a, b, r); +} + +template +void GPUDevice::rng_gaussian(const int N, const Dtype mu, + const Dtype sigma, Dtype* r) { + caffe_gpu_rng_gaussian(N, mu, sigma, r); +} + +template +void GPUDevice::rng_bernoulli(const int N, const Dtype p, int* r) { + NOT_IMPLEMENTED; +// caffe_gpu_rng_bernoulli(N, p, r); +} + +template +void GPUDevice::exp(const int N, const Dtype* a, Dtype* y) { + NOT_IMPLEMENTED; +// caffe_gpu_exp(N, a, y); +} + +template +void GPUDevice::dot(const int N, const Dtype* x, const Dtype* y, + Dtype* out) { + caffe_gpu_dot(N, x, y, out); +} + +template +void GPUDevice::hamming_distance(const int N, const Dtype* x, + const Dtype* y, uint32_t* out) { + *out = caffe_gpu_hamming_distance(N, x, y); +} + +template +// Returns the sum of the absolute values of the elements of vector x +void GPUDevice::asum(const int N, const Dtype* x, Dtype* y) { + caffe_gpu_asum(N, x, y); +} + +template +void GPUDevice::sign(const int N, const Dtype* x, Dtype* y) { + caffe_gpu_sign(N, x, y); +} + +template +void GPUDevice::sgnbit(const int N, const Dtype* x, Dtype* y) { + caffe_gpu_sgnbit(N, x, y); +} + +template +void GPUDevice::fabs(const int N, const Dtype* x, 
Dtype* y) { + caffe_gpu_fabs(N, x, y); +} + +template +void GPUDevice::scale(const int N, const Dtype alpha, + const Dtype *x, Dtype* y) { + caffe_gpu_scale(N, alpha, x, y); +} + +template +void GPUDevice::im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col) { + im2col_gpu(data_im, channels, height, width, ksize, pad, stride, + data_col); +} + +template +void GPUDevice::col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im) { + col2im_gpu(data_col, channels, height, width, psize, pad, stride, + data_im); +} + +INSTANTIATE_CLASS(GPUDevice); + +} // namespace caffe diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp new file mode 100644 index 00000000000..a1099e11100 --- /dev/null +++ b/src/caffe/util/opencl_device.cpp @@ -0,0 +1,383 @@ +// Copyright 2014 BVLC and contributors. + +#ifdef USE_OPENCL +#include "caffe/common.hpp" +#include "caffe/util/opencl_device.hpp" +#include "caffe/util/opencl_math_functions.hpp" + +#include + +namespace caffe { + +shared_ptr CaffeOpenCL::singleton_; + +cl_device_type CaffeOpenCL::get_device_type() { + switch (Caffe::mode()) { + case Caffe::OPENCL_CPU: + return CL_DEVICE_TYPE_CPU; + case Caffe::OPENCL_GPU: + return CL_DEVICE_TYPE_GPU; + case Caffe::OPENCL_ALL: + default: + return CL_DEVICE_TYPE_ALL; + } +} + +/** + * http://dhruba.name/2012/08/14/opencl-cookbook-listing-all-devices-and-their-critical-attributes/ + */ +void CaffeOpenCL::create_context() { + cl_uint platformCount; + CL_CHECK(clGetPlatformIDs(0, NULL, &platformCount)); + + cl_platform_id* platforms = (cl_platform_id*) + malloc(sizeof(cl_platform_id) * platformCount); + CL_CHECK(clGetPlatformIDs(1, platforms, NULL)); + + cl_uint device_count; + cl_device_type device_type = get_device_type(); + int num_devices_to_skip = current_device_id_; + while (num_devices_to_skip >= 0) { + for (int i = 0; i < platformCount; i++) { + cl_context_properties properties[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)( + platforms[i]), 0}; + // get all devices + clGetDeviceIDs(platforms[i], device_type, 0, NULL, &device_count); + if (num_devices_to_skip <= device_count) { + current_cl_platform_id_ = platforms[i]; + current_platform_device_count_ = device_count; + current_platform_device_id_ = num_devices_to_skip; + current_platform_device_ids_.resize(device_count); + CL_CHECK(clGetDeviceIDs(current_cl_platform_id_, device_type, + current_platform_device_count_, + &(current_platform_device_ids_[0]), NULL)); + cl_int error = CL_SUCCESS; // Used to handle error codes +/* + * http://dhruba.name/2012/10/14/opencl-cookbook-how-to-leverage-multiple-devices-in-opencl/ + * https://software.intel.com/sites/products/documentation/ioclsdk/2013/OG/Using_Shared_Context_for_Multiple_OpenCL_Devices.htm + */ + cl_context_ = clCreateContext( + properties, device_count, &(current_platform_device_ids_[0]), + NULL, NULL, &error); + CL_CHECK(error); + } + num_devices_to_skip -= device_count; + if (num_devices_to_skip < 0) { + break; + } + } + } +} + +cl_device_id CaffeOpenCL::current_cl_device_id() { + // To initialize current platform info + context(); + return current_platform_device_ids_[current_platform_device_id_]; +} + +void CaffeOpenCL::create_queue() { + cl_int error = 0; // Used to handle error codes + cl_command_queue_properties properties = 0; + cl_command_queue_ = clCreateCommandQueue( + 
context(), current_cl_device_id(), properties, &error); + CL_CHECK(error); +} + +void CaffeOpenCL::release_context() { + CL_CHECK(clReleaseContext(cl_context_)); + cl_context_ = NULL; +} + +void CaffeOpenCL::release_queue() { + CL_CHECK(clReleaseCommandQueue(cl_command_queue_)); + cl_command_queue_ = NULL; +} + +void CaffeOpenCL::SetDevice(const int device_id) { + if (current_device_id_ != device_id) { + current_device_id_ = device_id; + release_queue(); + // TODO: reuse context for the devices of the same platform + release_context(); + context(); + finalize_clblas(); + initialize_clblas(); + } +} + +void CaffeOpenCL::initialize_clblas() { + if (!clblas_initialized_) { + CLBLAS_CHECK(clblasSetup()); + clblas_initialized_ = true; + } +} + +void CaffeOpenCL::finalize_clblas() { + if (clblas_initialized_) { + clblasTeardown(); + clblas_initialized_ = false; + } +} + +template +void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, + const Dtype beta, Dtype* C) { + caffe_opencl_gemm(TransA, TransB, M, N, K, alpha, A, B, beta, C); +} + +template +void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const Dtype alpha, const Dtype* A, + const Dtype* x, const Dtype beta, Dtype* y) { + caffe_opencl_gemv(TransA, M, N, alpha, A, x, beta, y); +} + +template +void OpenCLDevice::axpy(const int N, const Dtype alpha, + const Dtype* X, Dtype* Y) { + caffe_opencl_axpy(N, alpha, X, Y); +} + +template +void OpenCLDevice::scal(const int N, const Dtype alpha, Dtype *X) { + caffe_opencl_scal(N, alpha, X); +} + +template +void OpenCLDevice::axpby( + const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y) { + caffe_opencl_axpby(N, alpha, X, beta, Y); +} + +template +void OpenCLDevice::copy(const int N, const Dtype *X, Dtype *Y) { + caffe_opencl_copy(N, X, Y); +} + + +/** + * http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clEnqueueWriteBuffer.html + */ +template +void OpenCLDevice::copy_from_cpu(const int N, const Dtype *X, + Dtype *Y) { + caffe_opencl_copy_from_cpu(N, X, Y); +} + +/** + * http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clEnqueueFillBuffer.html + */ +template +void OpenCLDevice::set(const int N, const Dtype alpha, Dtype *X) { + caffe_opencl_set(N, alpha, X); +} + + +//template +//void OpenCLDevice::add_scalar(const int N, const Dtype alpha, +// Dtype *X) { +// NOT_IMPLEMENTED; +//} + +//template +//void OpenCLDevice::powx(const int N, const Dtype* a, const Dtype b, +// Dtype* y) { +// NOT_IMPLEMENTED; +//// caffe_gpu_powx(N, a, b, y); +//} + +//template +//void OpenCLDevice::rng_uniform(const int N, const Dtype a, +// const Dtype b, Dtype* r) { +// NOT_IMPLEMENTED; +//// caffe_gpu_rng_uniform(N, a, b, r); +//} + +//template +//void OpenCLDevice::rng_gaussian(const int N, const Dtype mu, +// const Dtype sigma, Dtype* r) { +// NOT_IMPLEMENTED; +//// caffe_gpu_rng_gaussian(N, mu, sigma, r); +//} + +//template +//void OpenCLDevice::rng_bernoulli(const int N, const Dtype p, int* r) { +// NOT_IMPLEMENTED; +//// caffe_gpu_rng_bernoulli(N, p, r); +//} + +//template +//void OpenCLDevice::dot(const int N, const Dtype* x, const Dtype* y, +// Dtype* out) { +// NOT_IMPLEMENTED; +//// caffe_gpu_dot(N, x, y, out); +//} + +//template +//void OpenCLDevice::hamming_distance(const int N, const Dtype* x, +// const Dtype* y, uint32_t* out) { +// NOT_IMPLEMENTED; +//// *out = caffe_gpu_hamming_distance(N, x, y); +//} + +/** + 
* +clblasSasum( + size_t N, + cl_mem asum, + size_t offAsum, + const cl_mem X, + size_t offx, + int incx, + cl_mem scratchBuff, + cl_uint numCommandQueues, + cl_command_queue *commandQueues, + cl_uint numEventsInWaitList, + const cl_event *eventWaitList, + cl_event *events) + */ +//template +//void OpenCLDevice::asum(const int N, const Dtype* x, Dtype* y) { +// NOT_IMPLEMENTED; +//// CREATE_CL_MEM(x, N, 1, READ_ONLY); +//// CREATE_CL_MEM(y, N, 1, READ_WRITE); +//// PRE_CLBLAS_CALL; +//// CLBLAS_CHECK(clblasSasum( +//// N, alpha, ARRAY(X), +//// CLBALS_TRAILING_ARGS)); +//} + +//template +//void OpenCLDevice::scale(const int N, const Dtype alpha, +// const Dtype *x, Dtype* y) { +// this->copy(N, x, y); +// this->scal(N, alpha, y); +//} + +//template +//void OpenCLDevice::im2col( +// const Dtype* data_im, const int channels, +// const int height, const int width, const int ksize, const int pad, +// const int stride, Dtype* data_col) { +//// NOT_IMPLEMENTED; +//// im2col_gpu(data_im, channels, height, width, ksize, pad, stride, +//// data_col); +//} + +//template +//void OpenCLDevice::col2im( +// const Dtype* data_col, const int channels, +// const int height, const int width, const int psize, const int pad, +// const int stride, Dtype* data_im) { +//// NOT_IMPLEMENTED; +//// col2im_gpu(data_col, channels, height, width, psize, pad, stride, +//// data_im); +//} + + +INSTANTIATE_CLASS(OpenCLDevice); + +const char* clGetErrorString(cl_int error) { + switch (error) { + case CL_SUCCESS: + return "CL_SUCCESS"; + case CL_INVALID_VALUE: + return "CL_INVALID_VALUE"; + case CL_INVALID_COMMAND_QUEUE: + return "CL_INVALID_COMMAND_QUEUE"; + case CL_INVALID_CONTEXT: + return "CL_INVALID_CONTEXT"; + case CL_INVALID_MEM_OBJECT: + return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_DEVICE: + return "CL_INVALID_DEVICE"; + case CL_INVALID_EVENT_WAIT_LIST: + return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_OUT_OF_RESOURCES: + return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: + return "CL_OUT_OF_HOST_MEMORY"; + case CL_INVALID_OPERATION: + return "CL_INVALID_OPERATION"; + case CL_COMPILER_NOT_AVAILABLE: + return "CL_COMPILER_NOT_AVAILABLE"; + case CL_BUILD_PROGRAM_FAILURE: + return "CL_BUILD_PROGRAM_FAILURE"; + } + return "Unknown OpenCL error"; +} + +const char* clblasGetErrorString(clblasStatus status) { + switch (status) { + case clblasSuccess: + return "clblasSuccess"; + case clblasInvalidValue: + return "clblasInvalidValue"; + case clblasInvalidCommandQueue: + return "clblasInvalidCommandQueue"; + case clblasInvalidContext: + return "clblasInvalidContext"; + case clblasInvalidMemObject: + return "clblasInvalidMemObject"; + case clblasInvalidDevice: + return "clblasInvalidDevice"; + case clblasInvalidEventWaitList: + return "clblasInvalidEventWaitList"; + case clblasOutOfResources: + return "clblasOutOfResources"; + case clblasOutOfHostMemory: + return "clblasOutOfHostMemory"; + case clblasInvalidOperation: + return "clblasInvalidOperation"; + case clblasCompilerNotAvailable: + return "clblasCompilerNotAvailable"; + case clblasBuildProgramFailure: + return "clblasBuildProgramFailure"; + case clblasNotImplemented: + return "clblasNotImplemented"; + case clblasNotInitialized: + return "clblasNotInitialized"; + case clblasInvalidMatA: + return "clblasInvalidMatA"; + case clblasInvalidMatB: + return "clblasInvalidMatB"; + case clblasInvalidMatC: + return "clblasInvalidMatC"; + case clblasInvalidVecX: + return "clblasInvalidVecX"; + case clblasInvalidVecY: + return "clblasInvalidVecY"; + case 
clblasInvalidDim: + return "clblasInvalidDim"; + case clblasInvalidLeadDimA: + return "clblasInvalidLeadDimA"; + case clblasInvalidLeadDimB: + return "clblasInvalidLeadDimB"; + case clblasInvalidLeadDimC: + return "clblasInvalidLeadDimC"; + case clblasInvalidIncX: + return "clblasInvalidIncX"; + case clblasInvalidIncY: + return "clblasInvalidIncY"; + case clblasInsufficientMemMatA: + return "clblasInsufficientMemMatA"; + case clblasInsufficientMemMatB: + return "clblasInsufficientMemMatB"; + case clblasInsufficientMemMatC: + return "clblasInsufficientMemMatC"; + case clblasInsufficientMemVecX: + return "clblasInsufficientMemVecX"; + case clblasInsufficientMemVecY: + return "clblasInsufficientMemVecY"; + } + return "Unknown clblas status"; +} + +} // namespace caffe + +#endif // USE_OPENCL diff --git a/src/caffe/util/opencl_math_functions.cpp b/src/caffe/util/opencl_math_functions.cpp new file mode 100644 index 00000000000..7f9cbc0f3a3 --- /dev/null +++ b/src/caffe/util/opencl_math_functions.cpp @@ -0,0 +1,234 @@ +// Copyright 2014 BVLC and contributors. + +#ifdef USE_OPENCL +//#include "caffe/common.hpp" +#include "caffe/util/opencl_math_functions.hpp" + +namespace caffe { + +template <> +void caffe_opencl_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + int ldA = (TransA == CblasNoTrans) ? K : M; + int ldB = (TransB == CblasNoTrans) ? N : K; + int ldC = N; + clblasTranspose clTransA = to_clblasTranspose(TransA); + clblasTranspose clTransB = to_clblasTranspose(TransB); + CREATE_CL_MEM(A, M, K, READ_ONLY); + CREATE_CL_MEM(B, K, N, READ_ONLY); + CREATE_CL_MEM(C, M, N, READ_WRITE); + ENQUEUE_CL_BUFFER(Write, A, M, K); + ENQUEUE_CL_BUFFER(Write, B, K, N); + ENQUEUE_CL_BUFFER(Write, C, M, N); + PRE_CLBLAS_CALL; + // bufX is defined by the macro CREATE_CL_MEM(X, ...) + CLBLAS_CHECK(clblasSgemm(clblasRowMajor, clTransA, clTransB, + M, N, K, alpha, ARRAY(A), ARRAY(B), beta, ARRAY(C), + CLBLAS_TRAILING_ARGS)); + /* Release OpenCL memory objects. */ + RELEASE_CL_MEM(C); + RELEASE_CL_MEM(B); + RELEASE_CL_MEM(A); +} + +template <> +void caffe_opencl_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + int ldA = (TransA == CblasNoTrans) ? K : M; + int ldB = (TransB == CblasNoTrans) ? N : K; + int ldC = N; + clblasTranspose clTransA = to_clblasTranspose(TransA); + clblasTranspose clTransB = to_clblasTranspose(TransB); + CREATE_CL_MEM(A, M, K, READ_ONLY); + CREATE_CL_MEM(B, K, N, READ_ONLY); + CREATE_CL_MEM(C, M, N, READ_WRITE); + ENQUEUE_CL_BUFFER(Write, A, M, K); + ENQUEUE_CL_BUFFER(Write, B, K, N); + ENQUEUE_CL_BUFFER(Write, C, M, N); + PRE_CLBLAS_CALL; + // bufX is defined by the macro CREATE_CL_MEM(X, ...) + CLBLAS_CHECK(clblasDgemm(clblasRowMajor, clTransA, clTransB, + M, N, K, alpha, ARRAY(A), ARRAY(B), beta, ARRAY(C), + CLBLAS_TRAILING_ARGS)); + /* Release OpenCL memory objects. */ + RELEASE_CL_MEM(C); + RELEASE_CL_MEM(B); + RELEASE_CL_MEM(A); +} + +template <> +void caffe_opencl_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { + clblasTranspose clTransA = to_clblasTranspose(TransA); + int ldA = (TransA == CblasNoTrans) ? 
N : M; + int ldx = N; + int ldy = N; + CREATE_CL_MEM(A, M, N, READ_ONLY); + CREATE_CL_MEM(x, N, 1, READ_ONLY); + CREATE_CL_MEM(y, M, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasSgemv(clblasRowMajor, clTransA, M, N, alpha, + ARRAY(A), ARRAY(x), beta, ARRAY(y), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { + clblasTranspose clTransA = to_clblasTranspose(TransA); + int ldA = (TransA == CblasNoTrans) ? N : M; + int ldx = N; + int ldy = N; + CREATE_CL_MEM(A, M, N, READ_ONLY); + CREATE_CL_MEM(x, N, 1, READ_ONLY); + CREATE_CL_MEM(y, M, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasDgemv(clblasRowMajor, clTransA, M, N, alpha, + ARRAY(A), ARRAY(x), beta, ARRAY(y), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_axpy(const int N, const float alpha, const float* X, + float* Y) { + int ldX = N; + int ldY = N; + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasSaxpy( + N, alpha, ARRAY(X), ARRAY(Y), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_axpy(const int N, const double alpha, const double* X, + double* Y) { + int ldX = N; + int ldY = N; + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasDaxpy( + N, alpha, ARRAY(X), ARRAY(Y), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_copy(const int N, const float* X, float* Y) { + int ldX = N; + int ldY = N; + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasScopy( + N, ARRAY(X), ARRAY(Y), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_copy(const int N, const double* X, double* Y) { + int ldX = N; + int ldY = N; + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasDcopy( + N, ARRAY(X), ARRAY(Y), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_scal(const int N, const float alpha, float *X) { + int ldX = N; + CREATE_CL_MEM(X, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasSscal( + N, alpha, ARRAY(X), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_scal(const int N, const double alpha, double *X) { + int ldX = N; + CREATE_CL_MEM(X, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasDscal( + N, alpha, ARRAY(X), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_axpby(const int N, const float alpha, const float* X, + const float beta, float* Y) { + caffe_opencl_scal(N, beta, Y); + caffe_opencl_axpy(N, alpha, X, Y); +} + +template <> +void caffe_opencl_axpby(const int N, const double alpha, const double* X, + const double beta, double* Y) { + caffe_opencl_scal(N, beta, Y); + caffe_opencl_axpy(N, alpha, X, Y); +} + +template +void caffe_opencl_copy_from_cpu(const int N, const Dtype *X, Dtype *Y) { + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + cl_bool blocking_write = CL_TRUE; + cl_uint num_events_in_wait_list = 0; + cl_event *event_wait_list = NULL; + cl_event events = NULL; + CL_CHECK(clEnqueueWriteBuffer( + CaffeOpenCL::queue(), bufY, blocking_write, 0, N * sizeof(Dtype), + X, num_events_in_wait_list, event_wait_list, &events)); +} + +template +void caffe_opencl_copy_from_cpu(const int N, const float *X, float *Y); +template +void caffe_opencl_copy_from_cpu(const int N, const double *X, double *Y); + + +template 
+void caffe_opencl_set(const int N, const Dtype alpha, Dtype *X) { +#ifdef CL_VERSION_1_2 + CREATE_CL_MEM(X, N, 1, READ_WRITE); + cl_uint num_events_in_wait_list = 0; + cl_event *event_wait_list = NULL; + cl_event event = NULL; + CL_CHECK(clEnqueueFillBuffer( + CaffeOpenCL::queue(), bufX, static_cast(&alpha), sizeof(Dtype), + 0, sizeof(Dtype) * N, num_events_in_wait_list, event_wait_list, &event)); +#else + std::vector tmp(N, alpha); + caffe_opencl_copy_from_cpu(N, &tmp[0], X); +#endif +} + +template +void caffe_opencl_set(const int N, const float alpha, float *X); +template +void caffe_opencl_set(const int N, const double alpha, double *X); + + +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sqr, y[i] = x[i] * x[i]); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(exp, y[i] = exp(x[i])); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sign, y[i] = sign(x[i])); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sgnbit, y[i] = signbit(x[i])); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(fabs, y[i] = fabs(x[i])); + +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(add, y[i] = a[i] + b[i]); +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(sub, y[i] = a[i] - b[i]); +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(mul, y[i] = a[i] * b[i]); +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(div, y[i] = a[i] / b[i]); + +} // namespace caffe + +#endif // USE_OPENCL
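For orientation, here is a plausible shape for the element-wise kernels behind DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC / _BINARY_FUNC. The real macros live in opencl_math_functions.hpp and may differ; the kernel source, kernel name, and the use of CaffeOpenCL::context() / CaffeOpenCL::queue() below are assumptions, and the sketch is float-only:

// Sketch: compile and launch a unary element-wise kernel computing y[i] = fabs(x[i]).
static void opencl_fabs_sketch(const int N, cl_mem bufX, cl_mem bufY) {
  static const char* kSource =
      "__kernel void unary_fabs(const int n, __global const float* x,\n"
      "                         __global float* y) {\n"
      "  int i = get_global_id(0);\n"
      "  if (i < n) { y[i] = fabs(x[i]); }\n"
      "}\n";
  cl_int error = CL_SUCCESS;
  cl_context ctx = CaffeOpenCL::context();
  cl_program program = clCreateProgramWithSource(ctx, 1, &kSource, NULL, &error);
  CL_CHECK(error);
  CL_CHECK(clBuildProgram(program, 0, NULL, NULL, NULL, NULL));
  cl_kernel kernel = clCreateKernel(program, "unary_fabs", &error);
  CL_CHECK(error);
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &N));
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufX));
  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufY));
  size_t global_size = static_cast<size_t>(N);
  CL_CHECK(clEnqueueNDRangeKernel(CaffeOpenCL::queue(), kernel, 1, NULL,
                                  &global_size, NULL, 0, NULL, NULL));
  CL_CHECK(clReleaseKernel(kernel));
  CL_CHECK(clReleaseProgram(program));
}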