diff --git a/Makefile b/Makefile
index 77e2ff5edbe..41d594851f0 100644
--- a/Makefile
+++ b/Makefile
@@ -224,6 +224,14 @@ endif
 INCLUDE_DIRS += $(BLAS_INCLUDE)
 LIBRARY_DIRS += $(BLAS_LIB)
 
+OPENCL ?= 0
+ifeq ($(OPENCL), 1)
+	INCLUDE_DIRS += $(OPENCL_INCLUDE_DIR) $(CLBLAS_INCLUDE_DIR)
+	LIBRARY_DIRS += $(OPENCL_LIB_DIR) $(CLBLAS_LIB_DIR)
+	LIBRARIES += $(OPENCL_LIBS) $(CLBLAS_LIBS)
+	COMMON_FLAGS += -DUSE_OPENCL
+endif
+
 # Complete build flags.
 COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
 CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS)
diff --git a/Makefile.config.example b/Makefile.config.example
index 73c3740b1c7..4bb904c54b4 100644
--- a/Makefile.config.example
+++ b/Makefile.config.example
@@ -46,6 +46,13 @@ PYTHON_INCLUDE := /usr/local/include/python2.7 \
 PYTHON_LIB := /usr/local/lib
 # PYTHON_LIB := $(HOME)/anaconda/lib
 
+OPENCL_INCLUDE_DIR := /opt/AMDAPP/include/
+OPENCL_LIB_DIR := /opt/AMDAPP/lib/x86_64/
+OPENCL_LIBS := OpenCL
+CLBLAS_INCLUDE_DIR := /home/user/Codes/clBLAS/src/package/include
+CLBLAS_LIB_DIR := /home/user/Codes/clBLAS/src/package/lib64
+CLBLAS_LIBS := clBLAS
+
 # Whatever else you find you need goes here.
 INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include
 LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib
diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index c04375a10e2..30593388347 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -71,6 +71,12 @@ class Blob {
   Dtype* mutable_gpu_data();
   Dtype* mutable_cpu_diff();
   Dtype* mutable_gpu_diff();
+
+  const Dtype* const_data() const;
+  const Dtype* const_diff() const;
+  Dtype* mutable_data();
+  Dtype* mutable_diff();
+
   void Update();
   void FromProto(const BlobProto& proto);
   void ToProto(BlobProto* proto, bool write_diff = false) const;
diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index bd4e39f136d..e25ad38d022 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -74,7 +74,7 @@ class Caffe {
     }
     return *singleton_;
   }
-  enum Brew { CPU, GPU };
+  enum Brew { CPU, GPU, OPENCL_CPU, OPENCL_GPU, OPENCL_ALL };
   enum Phase { TRAIN, TEST };
diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp
index 2c6be551d8f..45112af72bc 100644
--- a/include/caffe/data_layers.hpp
+++ b/include/caffe/data_layers.hpp
@@ -31,6 +31,10 @@ class HDF5OutputLayer : public Layer<Dtype> {
   virtual ~HDF5OutputLayer();
   virtual void SetUp(const vector<Blob<Dtype>*>& bottom,
       vector<Blob<Dtype>*>* top) {}
+  virtual Dtype Forward(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+  virtual void Backward(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom) { return; }
 
   virtual inline LayerParameter_LayerType type() const {
     return LayerParameter_LayerType_HDF5_OUTPUT;
@@ -42,14 +46,6 @@ class HDF5OutputLayer : public Layer<Dtype> {
   inline std::string file_name() const { return file_name_; }
 
  protected:
-  virtual Dtype Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      vector<Blob<Dtype>*>* top);
-  virtual Dtype Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      vector<Blob<Dtype>*>* top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
   virtual void SaveBlobs();
 
   std::string file_name_;
@@ -67,6 +63,10 @@ class HDF5DataLayer : public Layer<Dtype> {
   virtual ~HDF5DataLayer();
   virtual void SetUp(const vector<Blob<Dtype>*>& bottom,
       vector<Blob<Dtype>*>* top);
+  virtual Dtype Forward(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+  virtual void Backward(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, vector<Blob<Dtype>*>*
bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_HDF5_DATA; @@ -75,14 +75,6 @@ class HDF5DataLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 2; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} virtual void LoadHDF5FileData(const char* filename); std::vector hdf_filenames_; @@ -111,6 +103,10 @@ class DataLayer : public Layer { virtual ~DataLayer(); virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_DATA; @@ -120,15 +116,6 @@ class DataLayer : public Layer { virtual inline int MaxTopBlobs() const { return 2; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void CreatePrefetchThread(); virtual void JoinPrefetchThread(); virtual unsigned int PrefetchRand(); @@ -170,15 +157,12 @@ class DummyDataLayer : public Layer { } virtual inline int ExactNumBottomBlobs() const { return 0; } virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual Dtype Forward_cpu(const vector*>& bottom, + virtual Dtype Forward(const vector*>& bottom, vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, + virtual void Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) {} + protected: vector > > fillers_; vector refill_; }; @@ -198,6 +182,10 @@ class ImageDataLayer : public Layer { virtual ~ImageDataLayer(); virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_IMAGE_DATA; @@ -206,15 +194,6 @@ class ImageDataLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 2; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void ShuffleImages(); virtual void CreatePrefetchThread(); @@ -244,6 +223,10 @@ class MemoryDataLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) {} virtual inline LayerParameter_LayerType type() 
const { return LayerParameter_LayerType_MEMORY_DATA; @@ -260,13 +243,6 @@ class MemoryDataLayer : public Layer { int batch_size() { return batch_size_; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - Dtype* data_; Dtype* labels_; int datum_channels_; @@ -293,6 +269,10 @@ class WindowDataLayer : public Layer { virtual ~WindowDataLayer(); virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_WINDOW_DATA; @@ -301,15 +281,6 @@ class WindowDataLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 2; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void CreatePrefetchThread(); virtual void JoinPrefetchThread(); virtual unsigned int PrefetchRand(); diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index 242f11a3513..acf34270f6a 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -34,7 +34,7 @@ class ConstantFiller : public Filler { explicit ConstantFiller(const FillerParameter& param) : Filler(param) {} virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); + Dtype* data = blob->mutable_data(); const int count = blob->count(); const Dtype value = this->filler_param_.value(); CHECK(count); @@ -54,7 +54,7 @@ class UniformFiller : public Filler { virtual void Fill(Blob* blob) { CHECK(blob->count()); caffe_rng_uniform(blob->count(), Dtype(this->filler_param_.min()), - Dtype(this->filler_param_.max()), blob->mutable_cpu_data()); + Dtype(this->filler_param_.max()), blob->mutable_data()); CHECK_EQ(this->filler_param_.sparse(), -1) << "Sparsity not supported by this Filler."; } @@ -66,10 +66,10 @@ class GaussianFiller : public Filler { explicit GaussianFiller(const FillerParameter& param) : Filler(param) {} virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); + Dtype* data = blob->mutable_data(); CHECK(blob->count()); caffe_rng_gaussian(blob->count(), Dtype(this->filler_param_.mean()), - Dtype(this->filler_param_.std()), blob->mutable_cpu_data()); + Dtype(this->filler_param_.std()), blob->mutable_data()); int sparse = this->filler_param_.sparse(); CHECK_GE(sparse, -1); if (sparse >= 0) { @@ -82,7 +82,7 @@ class GaussianFiller : public Filler { int num_inputs = blob->height(); Dtype non_zero_probability = Dtype(sparse) / Dtype(num_inputs); rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int))); - int* mask = reinterpret_cast(rand_vec_->mutable_cpu_data()); + int* mask = reinterpret_cast(rand_vec_->mutable_data()); caffe_rng_bernoulli(blob->count(), non_zero_probability, mask); for (int i = 0; i < blob->count(); ++i) { data[i] *= mask[i]; @@ -100,9 +100,9 @@ class PositiveUnitballFiller : public Filler { explicit PositiveUnitballFiller(const FillerParameter& param) : Filler(param) {} virtual 
void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); + Dtype* data = blob->mutable_data(); DCHECK(blob->count()); - caffe_rng_uniform(blob->count(), 0, 1, blob->mutable_cpu_data()); + caffe_rng_uniform(blob->count(), 0, 1, blob->mutable_data()); // We expect the filler to not be called very frequently, so we will // just use a simple implementation int dim = blob->count() / blob->num(); @@ -139,7 +139,7 @@ class XavierFiller : public Filler { int fan_in = blob->count() / blob->num(); Dtype scale = sqrt(Dtype(3) / fan_in); caffe_rng_uniform(blob->count(), -scale, scale, - blob->mutable_cpu_data()); + blob->mutable_data()); CHECK_EQ(this->filler_param_.sparse(), -1) << "Sparsity not supported by this Filler."; } diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index 690c36ba23f..47dddb6ad22 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -9,6 +9,7 @@ #include "caffe/blob.hpp" #include "caffe/common.hpp" #include "caffe/proto/caffe.pb.h" +#include "caffe/util/device.hpp" using std::string; using std::vector; @@ -43,9 +44,9 @@ class Layer { // Forward and backward wrappers. You should implement the cpu and // gpu specific implementations instead, and should not change these // functions. - inline Dtype Forward(const vector*>& bottom, + virtual Dtype Forward(const vector*>& bottom, vector*>* top); - inline void Backward(const vector*>& top, + virtual void Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom); @@ -101,7 +102,7 @@ class Layer { // Forward functions: compute the layer output // (and loss layers return the loss; other layers return the dummy value 0.) virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top) = 0; + vector*>* top) { return static_cast(0); } // If no gpu code is provided, we will simply use cpu code. virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top) { @@ -113,7 +114,7 @@ class Layer { // for the bottom blobs if propagate_down is true. virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, - vector*>* bottom) = 0; + vector*>* bottom) { return; } virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { @@ -165,7 +166,7 @@ class Layer { // gpu specific implementations instead, and should not change these // functions. 
template -inline Dtype Layer::Forward(const vector*>& bottom, +Dtype Layer::Forward(const vector*>& bottom, vector*>* top) { switch (Caffe::mode()) { case Caffe::CPU: @@ -179,7 +180,7 @@ inline Dtype Layer::Forward(const vector*>& bottom, } template -inline void Layer::Backward(const vector*>& top, +void Layer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { switch (Caffe::mode()) { diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 3a4d41662fd..ab8a9caa5f0 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -63,6 +63,10 @@ class SoftmaxWithLossLayer : public Layer { : Layer(param), softmax_layer_(new SoftmaxLayer(param)) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_SOFTMAX_LOSS; @@ -75,15 +79,6 @@ class SoftmaxWithLossLayer : public Layer { } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - shared_ptr > softmax_layer_; // prob stores the output probability of the layer. Blob prob_; @@ -103,21 +98,16 @@ class SigmoidCrossEntropyLossLayer : public LossLayer { sigmoid_output_(new Blob()) {} virtual void FurtherSetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_SIGMOID_CROSS_ENTROPY_LOSS; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - shared_ptr > sigmoid_layer_; // sigmoid_output stores the output of the sigmoid layer. 
shared_ptr > sigmoid_output_; @@ -139,6 +129,10 @@ class EuclideanLossLayer : public LossLayer { : LossLayer(param), diff_() {} virtual void FurtherSetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_EUCLIDEAN_LOSS; @@ -150,15 +144,6 @@ class EuclideanLossLayer : public LossLayer { } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - Blob diff_; }; @@ -171,17 +156,16 @@ class InfogainLossLayer : public LossLayer { : LossLayer(param), infogain_() {} virtual void FurtherSetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_INFOGAIN_LOSS; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - Blob infogain_; }; @@ -196,11 +180,9 @@ class HingeLossLayer : public LossLayer { virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_HINGE_LOSS; } - - protected: - virtual Dtype Forward_cpu(const vector*>& bottom, + virtual Dtype Forward(const vector*>& bottom, vector*>* top); - virtual void Backward_cpu(const vector*>& top, + virtual void Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom); }; @@ -213,16 +195,14 @@ class MultinomialLogisticLossLayer : public LossLayer { : LossLayer(param) {} virtual void FurtherSetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS; } - - protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); }; /* AccuracyLayer @@ -236,6 +216,12 @@ class AccuracyLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) { + NOT_IMPLEMENTED; + } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_ACCURACY; @@ -245,13 +231,6 @@ class AccuracyLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - NOT_IMPLEMENTED; - } - int top_k_; }; diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index e52e395e24b..16c8ae64835 100644 --- a/include/caffe/neuron_layers.hpp +++ 
b/include/caffe/neuron_layers.hpp @@ -120,21 +120,16 @@ class PowerLayer : public NeuronLayer { : NeuronLayer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_POWER; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - Dtype power_; Dtype scale_; Dtype shift_; diff --git a/include/caffe/opencl_syncedmem.hpp b/include/caffe/opencl_syncedmem.hpp new file mode 100644 index 00000000000..22d045ce3d4 --- /dev/null +++ b/include/caffe/opencl_syncedmem.hpp @@ -0,0 +1,77 @@ +// Copyright 2014 BVLC and contributors. + +#ifdef USE_OPENCL +#ifndef CAFFE_OPENCL_SYNCEDMEM_HPP_ +#define CAFFE_OPENCL_SYNCEDMEM_HPP_ + +#include +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include "caffe/common.hpp" +#include "caffe/syncedmem.hpp" +#include "caffe/util/opencl_device.hpp" + +namespace caffe { + + +/* + * https://software.intel.com/sites/products/documentation/ioclsdk/2013/OG/Mapping_Memory_Objects_(USE_HOST_PTR).htm + * For efficiency reasons such a host-side pointer must be allocated for the + * conditions: + * * The amount of memory you allocate and the size of the corresponding + * * OpenCL* buffer must be multiple of the cache line sizes (64 bytes). + * * Always use 4k alignment (page alignment) when you allocate the host memory + * * for sharing with OpenCL devices. 
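+ *
+ * Usage sketch (editorial illustration only; `context` stands for a valid
+ * cl_context and `count` for the number of elements to allocate):
+ *
+ *   size_t bytes = count * sizeof(float);
+ *   void* host_ptr = NULL;
+ *   opencl_aligned_malloc(&host_ptr, &bytes);  // page-aligned, padded size
+ *   cl_int error;
+ *   cl_mem buf = clCreateBuffer(context,
+ *       CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, bytes, host_ptr, &error);
+ *   // ... map/unmap buf to touch the data on the host, enqueue kernels ...
+ *   clReleaseMemObject(buf);
+ *   opencl_aligned_free(host_ptr);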
+ */ +#define OPENCL_CACHE_LINE_SIZE 64 +#define OPENCL_PAGE_ALIGNMENT 4096 + +inline void opencl_aligned_malloc(void** ptr, size_t* size) { + *size += (*size % OPENCL_CACHE_LINE_SIZE); +#ifdef _MSC_VER + *ptr = _aligned_malloc(*size, OPENCL_PAGE_ALIGNMENT); +#else + if(posix_memalign(ptr, OPENCL_PAGE_ALIGNMENT, *size)) { + *ptr = NULL; + } +#endif +} + +inline void opencl_aligned_free(void* ptr) { +#ifdef _MSC_VER + _aligned_free(ptr); +#else + free(ptr); +#endif +} + +class OpenCLSyncedMemory : public AbstractSyncedMemory { + public: + OpenCLSyncedMemory() : AbstractSyncedMemory() {} + explicit OpenCLSyncedMemory(size_t size) : AbstractSyncedMemory(size) {} + ~OpenCLSyncedMemory(); + const void* cpu_data(); + void set_cpu_data(void* data); + const void* gpu_data(); + void* mutable_cpu_data(); + void* mutable_gpu_data(); + protected: + void to_cpu(); + void to_gpu(); + + private: + void* shared_host_ptr_; + void* mapped_device_ptr_; + cl_mem device_mem_; + + DISABLE_COPY_AND_ASSIGN(OpenCLSyncedMemory); +}; // class OpenCLSyncedMemory + +} // namespace caffe + +#endif // CAFFE_OPENCL_SYNCEDMEM_HPP_ +#endif // USE_OPENCL diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index bed55c3806e..30e7d771ad4 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -31,33 +31,56 @@ inline void CaffeFreeHost(void* ptr) { free(ptr); } - -class SyncedMemory { +class AbstractSyncedMemory { public: - SyncedMemory() + AbstractSyncedMemory() : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), own_cpu_data_(false) {} - explicit SyncedMemory(size_t size) + explicit AbstractSyncedMemory(size_t size) : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), own_cpu_data_(false) {} - ~SyncedMemory(); + virtual ~AbstractSyncedMemory() {} + enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; + virtual const void* cpu_data() = 0; + virtual void set_cpu_data(void* data) = 0; + virtual const void* gpu_data() = 0; + virtual void* mutable_cpu_data() = 0; + virtual void* mutable_gpu_data() = 0; + virtual SyncedHead head() { return head_; } + virtual size_t size() { return size_; } + + virtual const void* const_data() { return NULL; } + virtual void* mutable_data() { return NULL;} + + protected: + virtual void to_cpu() = 0; + virtual void to_gpu() = 0; + + protected: + void* cpu_ptr_; + void* gpu_ptr_; + size_t size_; + SyncedHead head_; + bool own_cpu_data_; +}; + +class SyncedMemory : public AbstractSyncedMemory { + public: + SyncedMemory() : AbstractSyncedMemory() {} + explicit SyncedMemory(size_t size) : AbstractSyncedMemory(size) {} + virtual ~SyncedMemory(); const void* cpu_data(); void set_cpu_data(void* data); const void* gpu_data(); void* mutable_cpu_data(); void* mutable_gpu_data(); - enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; - SyncedHead head() { return head_; } - size_t size() { return size_; } - private: + const void* const_data(); + void* mutable_data(); + + protected: void to_cpu(); void to_gpu(); - void* cpu_ptr_; - void* gpu_ptr_; - size_t size_; - SyncedHead head_; - bool own_cpu_data_; DISABLE_COPY_AND_ASSIGN(SyncedMemory); }; // class SyncedMemory diff --git a/include/caffe/syncedmem_factory.hpp b/include/caffe/syncedmem_factory.hpp new file mode 100644 index 00000000000..08ff2f0da09 --- /dev/null +++ b/include/caffe/syncedmem_factory.hpp @@ -0,0 +1,17 @@ +// Copyright 2014 BVLC and contributors. 
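
The syncedmem.hpp refactor above turns SyncedMemory into one concrete implementation of AbstractSyncedMemory, and the new syncedmem_factory.hpp header that follows declares a GetSyncedMemory() factory so callers no longer construct a concrete class directly. The factory's definition is not part of this section; a minimal sketch of what it could look like, assuming it dispatches on Caffe::mode() (everything beyond the names declared in these headers is illustrative):

// Hypothetical definition, e.g. in a src/caffe/ source file not shown in this diff.
namespace caffe {
AbstractSyncedMemory* GetSyncedMemory(const size_t size) {
#ifdef USE_OPENCL
  switch (Caffe::mode()) {
    case Caffe::OPENCL_CPU:
    case Caffe::OPENCL_GPU:
    case Caffe::OPENCL_ALL:
      // OpenCL modes get the mapped host-memory implementation.
      return new OpenCLSyncedMemory(size);
    default:
      break;
  }
#endif
  // CPU and CUDA GPU modes keep the existing behaviour.
  return new SyncedMemory(size);
}
}  // namespace caffe
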
+ +#ifndef CAFFE_SYNCEDMEM_FACTORY_HPP_ +#define CAFFE_SYNCEDMEM_FACTORY_HPP_ + +#include "caffe/common.hpp" +#include "caffe/syncedmem.hpp" +#include "caffe/opencl_syncedmem.hpp" + +namespace caffe { + +// The SyncedMemory factory function +AbstractSyncedMemory* GetSyncedMemory(const size_t size = 0); + +} // namespace caffe + +#endif // CAFFE_SYNCEDMEM_FACTORY_HPP_ diff --git a/include/caffe/util/device.hpp b/include/caffe/util/device.hpp new file mode 100644 index 00000000000..ef3e9d5cdbd --- /dev/null +++ b/include/caffe/util/device.hpp @@ -0,0 +1,269 @@ +// Copyright 2014 BVLC and contributors. + +#ifndef CAFFE_UTIL_DEVICE_H_ +#define CAFFE_UTIL_DEVICE_H_ + +#include +#include + +#include "glog/logging.h" + +#include "caffe/util/im2col.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +class Device { + public: + virtual ~Device() { + } + virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C) { NOT_IMPLEMENTED; } + + virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, + const Dtype beta, Dtype* y) { NOT_IMPLEMENTED; } + + virtual void axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y) { NOT_IMPLEMENTED; } + + virtual void axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y) { NOT_IMPLEMENTED; } + + virtual void copy(const int N, const Dtype *X, Dtype *Y) { NOT_IMPLEMENTED; } + virtual void copy_from_cpu(const int N, const Dtype* X, Dtype* Y) { + NOT_IMPLEMENTED; } + + virtual void set(const int N, const Dtype alpha, Dtype *X) { + NOT_IMPLEMENTED; } + + virtual void add_scalar(const int N, const Dtype alpha, Dtype *X) { + NOT_IMPLEMENTED; } + + virtual void scal(const int N, const Dtype alpha, Dtype *X) { + NOT_IMPLEMENTED; } + + virtual void sqr(const int N, const Dtype* a, Dtype* y) { NOT_IMPLEMENTED; } + + virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y) { + NOT_IMPLEMENTED; } + + virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y) { + NOT_IMPLEMENTED; } + + virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y) { + NOT_IMPLEMENTED; } + + virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y) { + NOT_IMPLEMENTED; } + + virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y) { + NOT_IMPLEMENTED; } + + virtual void rng_uniform(const int N, const Dtype a, const Dtype b, + Dtype* r) { NOT_IMPLEMENTED; } + + virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, + Dtype* r) { NOT_IMPLEMENTED; } + + virtual void rng_bernoulli(const int N, const Dtype p, int* r) { + NOT_IMPLEMENTED; } + + virtual void exp(const int N, const Dtype* a, Dtype* y) { NOT_IMPLEMENTED; } + + virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out) { + NOT_IMPLEMENTED; } + + virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, + uint32_t* out) { NOT_IMPLEMENTED; } + +// Returns the sum of the absolute values of the elements of vector x + virtual void asum(const int N, const Dtype* x, Dtype* y) { NOT_IMPLEMENTED; } + + virtual void sign(const int N, const Dtype* x, Dtype* y) { NOT_IMPLEMENTED; } + + virtual void sgnbit(const int N, const Dtype* x, Dtype* y) { + NOT_IMPLEMENTED; } + + virtual void fabs(const int N, const Dtype* x, Dtype* y) { NOT_IMPLEMENTED; } + + virtual void 
scale(const int N, const Dtype alpha, const Dtype *x, + Dtype* y) { NOT_IMPLEMENTED; } + + virtual void im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col) { NOT_IMPLEMENTED; } + + virtual void col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im) { NOT_IMPLEMENTED; } +}; + +template +class CPUDevice : public Device { + public: + CPUDevice() { + } + virtual ~CPUDevice() { + } + virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, const Dtype beta, Dtype* C); + + virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, + const Dtype beta, Dtype* y); + + virtual void axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); + + virtual void axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + + virtual void copy(const int N, const Dtype *X, Dtype *Y); + virtual void copy_from_cpu(const int N, const Dtype* X, Dtype* Y); + + virtual void set(const int N, const Dtype alpha, Dtype *X); + + virtual void add_scalar(const int N, const Dtype alpha, Dtype *X); + + virtual void scal(const int N, const Dtype alpha, Dtype *X); + + virtual void sqr(const int N, const Dtype* a, Dtype* y); + + virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y); + + virtual void rng_uniform(const int N, const Dtype a, const Dtype b, Dtype* r); + + virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, + Dtype* r); + + virtual void rng_bernoulli(const int N, const Dtype p, int* r); + + virtual void exp(const int N, const Dtype* a, Dtype* y); + + virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); + + virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, + uint32_t* out); + +// Returns the sum of the absolute values of the elements of vector x + virtual void asum(const int N, const Dtype* x, Dtype* y); + + virtual void sign(const int N, const Dtype* x, Dtype* y); + + virtual void sgnbit(const int N, const Dtype* x, Dtype* y); + + virtual void fabs(const int N, const Dtype* x, Dtype* y); + + virtual void scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y); + + virtual void im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col); + + virtual void col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im); +}; + +template +class GPUDevice : public Device { + public: + GPUDevice() { + } + virtual ~GPUDevice() { + } + virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, const Dtype beta, Dtype* C); + + virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const 
Dtype* A, const Dtype* x, + const Dtype beta, Dtype* y); + + virtual void axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); + + virtual void axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + + virtual void copy(const int N, const Dtype *X, Dtype *Y); + virtual void copy_from_cpu(const int N, const Dtype* X, Dtype* Y); + + virtual void set(const int N, const Dtype alpha, Dtype *X); + + virtual void add_scalar(const int N, const Dtype alpha, Dtype *X); + + virtual void scal(const int N, const Dtype alpha, Dtype *X); + + virtual void sqr(const int N, const Dtype* a, Dtype* y); + + virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y); + + virtual void rng_uniform(const int N, const Dtype a, const Dtype b, Dtype* r); + + virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, + Dtype* r); + + virtual void rng_bernoulli(const int N, const Dtype p, int* r); + + virtual void exp(const int N, const Dtype* a, Dtype* y); + + virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); + + virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, + uint32_t* out); + +// Returns the sum of the absolute values of the elements of vector x + virtual void asum(const int N, const Dtype* x, Dtype* y); + + virtual void sign(const int N, const Dtype* x, Dtype* y); + + virtual void sgnbit(const int N, const Dtype* x, Dtype* y); + + virtual void fabs(const int N, const Dtype* x, Dtype* y); + + virtual void scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y); + + virtual void im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col); + + virtual void col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im); +}; + +template +class DeviceFactory { + public: + static Device* GetDevice(); + private: + static Device* cpu_device_; + static Device* gpu_device_; +#ifdef USE_OPENCL + static Device* opencl_device_; +#endif +}; + +} // namespace caffe + +#endif // CAFFE_UTIL_DEVICE_H_ diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index a649d8cc4e8..809308caf80 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -3,6 +3,8 @@ #ifndef _CAFFE_UTIL_IM2COL_HPP_ #define _CAFFE_UTIL_IM2COL_HPP_ +#include "caffe/common.hpp" + namespace caffe { template diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp new file mode 100644 index 00000000000..d976d580916 --- /dev/null +++ b/include/caffe/util/opencl_device.hpp @@ -0,0 +1,178 @@ +// Copyright 2014 BVLC and contributors. 
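
The Device<Dtype> hierarchy introduced in device.hpp above wraps the caffe_cpu_* / caffe_gpu_* math and im2col routines behind one virtual interface, and DeviceFactory<Dtype>::GetDevice() is meant to hand back the implementation matching the current Caffe::mode(). A before/after sketch of the call pattern inside a layer (illustrative fragment, not a complete function; bottom_data, weight and top_data are the usual layer-local pointers):

// Before: the backend is hard-coded in Forward_cpu() / Forward_gpu().
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M, N, K,
    (Dtype)1., bottom_data, weight, (Dtype)0., top_data);

// After: a single Forward() asks the factory for the active device.
Device<Dtype>* device = DeviceFactory<Dtype>::GetDevice();
device->gemm(CblasNoTrans, CblasNoTrans, M, N, K,
    (Dtype)1., bottom_data, weight, (Dtype)0., top_data);
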
+ +#ifdef USE_OPENCL +#ifndef CAFFE_UTIL_OPENCL_DEVICE_H_ +#define CAFFE_UTIL_OPENCL_DEVICE_H_ + +#ifdef __APPLE__ +#include +#else +#include +#endif +#include "clBLAS.h" + +#include "glog/logging.h" + +#include "caffe/util/device.hpp" + +#include + +namespace caffe { + +#define CL_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cl_int error = condition; \ + CHECK_EQ(error, CL_SUCCESS) << " " << clGetErrorString(error); \ + } while (0) + +#define CLBLAS_CHECK(condition) \ + do { \ + clblasStatus status = condition; \ + CHECK_EQ(status, clblasSuccess) << " " \ + << caffe::clblasGetErrorString(status); \ + } while (0) + +const char* clGetErrorString(cl_int error); +const char* clblasGetErrorString(clblasStatus status); + +class CaffeOpenCL { + public: + inline static CaffeOpenCL& Get() { + if (!singleton_.get()) { + singleton_.reset(new CaffeOpenCL()); + } + return *singleton_; + } + + virtual ~CaffeOpenCL() { + } + + void SetDevice(const int device_id); + inline static cl_context context() { + if (Get().cl_context_ == NULL) { + Get().create_context(); + } + return Get().cl_context_; + } + inline static cl_command_queue queue() { + if (Get().cl_command_queue_ == NULL) { + Get().create_queue(); + } + return Get().cl_command_queue_; + } + protected: + cl_device_type get_device_type(); + cl_device_id current_cl_device_id(); + void create_context(); + void release_context(); + void create_queue(); + void release_queue(); + void initialize_clblas(); + void finalize_clblas(); + protected: + static shared_ptr singleton_; + + int current_device_id_; + cl_platform_id current_cl_platform_id_; + cl_int current_platform_device_count_; + std::vector current_platform_device_ids_; + int current_platform_device_id_; + cl_context cl_context_; + cl_command_queue cl_command_queue_; + bool clblas_initialized_; + private: + CaffeOpenCL() : + current_device_id_(0), current_cl_platform_id_(NULL), + current_platform_device_count_(0), current_platform_device_id_(0), + cl_context_(NULL), cl_command_queue_(NULL), clblas_initialized_(false) { + initialize_clblas(); + } + + DISABLE_COPY_AND_ASSIGN(CaffeOpenCL); +}; + + +template +class OpenCLDevice : public Device { + public: + OpenCLDevice() : Device() { + } + + virtual ~OpenCLDevice() { + } + + virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, const Dtype beta, Dtype* C); + + virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, + const Dtype beta, Dtype* y); + + virtual void axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); + + virtual void axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + + virtual void copy(const int N, const Dtype *X, Dtype *Y); + virtual void copy_from_cpu(const int N, const Dtype* X, Dtype* Y); + + virtual void set(const int N, const Dtype alpha, Dtype *X); + +// virtual void add_scalar(const int N, const Dtype alpha, Dtype *X); + + virtual void scal(const int N, const Dtype alpha, Dtype *X); + +// virtual void sqr(const int N, const Dtype* a, Dtype* y); +// +// virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y); +// +// virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); +// +// virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); +// +// virtual void div(const int N, const Dtype* a, const Dtype* b, 
Dtype* y); + +// virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y); + +// virtual void rng_uniform(const int N, const Dtype a, const Dtype b, Dtype* r); +// +// virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, +// Dtype* r); +// +// virtual void rng_bernoulli(const int N, const Dtype p, int* r); + +// virtual void exp(const int N, const Dtype* a, Dtype* y); + +// virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); +// +// virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, +// uint32_t* out); + +// Returns the sum of the absolute values of the elements of vector x +// virtual void asum(const int N, const Dtype* x, Dtype* y); + +// virtual void sign(const int N, const Dtype* x, Dtype* y); + +// virtual void sgnbit(const int N, const Dtype* x, Dtype* y); + +// virtual void fabs(const int N, const Dtype* x, Dtype* y); + +// virtual void scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y); + +// virtual void im2col(const Dtype* data_im, const int channels, +// const int height, const int width, const int ksize, const int pad, +// const int stride, Dtype* data_col); +// +// virtual void col2im(const Dtype* data_col, const int channels, +// const int height, const int width, const int psize, const int pad, +// const int stride, Dtype* data_im); +}; + + +} // namespace caffe + +#endif // CAFFE_UTIL_OPENCL_DEVICE_H_ +#endif // USE_OPENCL + diff --git a/include/caffe/util/opencl_math_functions.hpp b/include/caffe/util/opencl_math_functions.hpp new file mode 100644 index 00000000000..1927c58cc80 --- /dev/null +++ b/include/caffe/util/opencl_math_functions.hpp @@ -0,0 +1,291 @@ +// Copyright 2014 BVLC and contributors. + +#ifdef USE_OPENCL +#ifndef CAFFE_UTIL_OPENCL_MATH_FUNCTIONS_H_ +#define CAFFE_UTIL_OPENCL_MATH_FUNCTIONS_H_ + +#include "caffe/util/opencl_device.hpp" + +namespace caffe { + +#define CREATE_CL_MEM(A, M, K, FLAG) \ + cl_mem buf##A; \ + do { \ + cl_int error; \ + buf##A = clCreateBuffer( \ + CaffeOpenCL::context(), CL_MEM_##FLAG, M * K * sizeof(*A), \ + NULL, &error); \ + CL_CHECK(error); \ + } while(0) + +#define RELEASE_CL_MEM(A) clReleaseMemObject(buf##A) + +#define ENQUEUE_CL_BUFFER(FLAG, A, M, K) \ + CL_CHECK(clEnqueue##FLAG##Buffer( \ + CaffeOpenCL::queue(), buf##A, CL_TRUE, 0, M * K * sizeof(*A), \ + A, 0, NULL, NULL)); + +#define PRE_CLBLAS_CALL \ + cl_uint num_command_queues = 1; \ + cl_uint num_events_in_wait_list = 0; \ + cl_event *event_wait_list = NULL; \ + cl_event events = NULL; \ + cl_command_queue queue = CaffeOpenCL::queue(); + +#define ARRAY(A) buf##A, 0, ld##A + +#define CLBLAS_TRAILING_ARGS \ + num_command_queues, &queue, num_events_in_wait_list, \ + event_wait_list, &events + +#define OPENCL_UNARY_KERNEL(Dtype_str, name_str, operation_str) \ +"template \n" \ +"__kernel void " name_str "( \n" \ +" __global " Dtype_str "* x, \n" \ +" __global " Dtype_str "* y, \n" \ +" const unsigned int count) { \n" \ +" for (int i = get_global_id(0); \n" \ +" i < (count); \n" \ +" i += get_global_size(0)) { \n" \ +" " operation_str "; \n" \ +"} \n" \ +"\n"; + +#define OPENCL_BINARY_KERNEL(Dtype_str, name_str, operation_str) \ +"__kernel void " name_str "( \n" \ +" __global " Dtype_str "* a, \n" \ +" __global " Dtype_str "* b, \n" \ +" __global " Dtype_str "* y, \n" \ +" const unsigned int count) { \n" \ +" for (int i = get_global_id(0); \n" \ +" i < (count); \n" \ +" i += get_global_size(0)) { \n" \ +" " operation_str "; \n" \ +"} \n" \ +"\n" + +// local_size: Number of work 
items in each local work group +// global_size: Number of total work items +#define DEFINE_LOCAL_AND_GLOBAL_SIZE(n) \ + const size_t local_size = 64; \ + const size_t global_size = (n + local_size - 1) \ + / local_size + + +// https://www.olcf.ornl.gov/tutorials/opencl-vector-addition/ +#define DEFINE_OPENCL_UNARY_FUNC(Dtype, name, operation) \ +template <> \ +void caffe_opencl_##name(const int n, const Dtype *x, Dtype *y) { \ + const char* kernel_source = OPENCL_UNARY_KERNEL(#Dtype, #name, \ + #operation); \ + cl_context context = CaffeOpenCL::context(); \ + cl_command_queue queue = CaffeOpenCL::queue(); \ + cl_int error; \ + const size_t bytes = n * sizeof(Dtype); \ + cl_program program = clCreateProgramWithSource( \ + context, 1, (const char **) & kernel_source, NULL, &error); \ + CL_CHECK(error); \ + clBuildProgram(program, 0, NULL, NULL, NULL, NULL); \ + cl_kernel kernel = clCreateKernel(program, #name, &error); \ + CL_CHECK(error); \ + cl_mem d_x = clCreateBuffer(context, \ + CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, \ + bytes, \ + const_cast(static_cast(x)), \ + &error); \ + cl_mem d_y = clCreateBuffer(context, \ + CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, \ + bytes, static_cast(y), &error); \ + void* mapped_x = clEnqueueMapBuffer( \ + queue, d_x, CL_TRUE, CL_MAP_READ, 0, bytes, 0, NULL, NULL, &error); \ + CL_CHECK(error); \ + CL_CHECK(clEnqueueUnmapMemObject( \ + CaffeOpenCL::queue(), d_x, mapped_x, \ + 0, NULL, NULL)); \ + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_x)); \ + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_y)); \ + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(unsigned int), &n)); \ + DEFINE_LOCAL_AND_GLOBAL_SIZE(n); \ + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, \ + &local_size, 0, NULL, NULL)); \ + CL_CHECK(clFinish(queue)); \ + CL_CHECK(clEnqueueReadBuffer(queue, d_y, CL_TRUE, 0, \ + bytes, y, 0, NULL, NULL )); \ + void* mapped_y = clEnqueueMapBuffer( \ + queue, d_y, CL_TRUE, CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &error); \ + CL_CHECK(error); \ + CL_CHECK(clEnqueueUnmapMemObject( \ + CaffeOpenCL::queue(), d_y, mapped_y, \ + 0, NULL, NULL)); \ + CL_CHECK(clReleaseMemObject(d_x)); \ + CL_CHECK(clReleaseMemObject(d_y)); \ + CL_CHECK(clReleaseProgram(program)); \ + CL_CHECK(clReleaseKernel(kernel)); \ +} + +#define DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(name, operation) \ + DEFINE_OPENCL_UNARY_FUNC(float, name, operation) \ + DEFINE_OPENCL_UNARY_FUNC(double, name, operation) + +#define DEFINE_OPENCL_BINARY_FUNC(Dtype, name, operation) \ +template <> \ +void caffe_opencl_##name(const int n, const Dtype *a, const Dtype *b, \ + Dtype *y) { \ + const char* kernel_source = OPENCL_BINARY_KERNEL(#Dtype, #name, \ + #operation); \ + cl_context context = CaffeOpenCL::context(); \ + cl_command_queue queue = CaffeOpenCL::queue(); \ + cl_int error; \ + const size_t bytes = n * sizeof(Dtype); \ + cl_program program = clCreateProgramWithSource( \ + context, 1, (const char **) & kernel_source, NULL, &error); \ + CL_CHECK(error); \ + clBuildProgram(program, 0, NULL, NULL, NULL, NULL); \ + cl_kernel kernel = clCreateKernel(program, #name, &error); \ + CL_CHECK(error); \ + cl_mem d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, \ + const_cast(static_cast(a)),\ + &error); \ + cl_mem d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, \ + const_cast(static_cast(b)), \ + &error); \ + cl_mem d_y = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, \ + static_cast(y), &error); \ + void* mapped_a = clEnqueueMapBuffer( \ + queue, d_a, CL_TRUE, 
CL_MAP_READ, 0, bytes, 0, NULL, NULL, &error); \ + CL_CHECK(error); \ + CL_CHECK(clEnqueueUnmapMemObject( \ + CaffeOpenCL::queue(), d_a, mapped_a, \ + 0, NULL, NULL)); \ + void* mapped_b = clEnqueueMapBuffer( \ + queue, d_b, CL_TRUE, CL_MAP_READ, 0, bytes, 0, NULL, NULL, &error); \ + CL_CHECK(error); \ + CL_CHECK(clEnqueueUnmapMemObject( \ + CaffeOpenCL::queue(), d_b, mapped_b, \ + 0, NULL, NULL)); \ + CL_CHECK(clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, \ + bytes, a, 0, NULL, NULL)); \ + CL_CHECK(clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, \ + bytes, b, 0, NULL, NULL)); \ + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a)); \ + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b)); \ + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_y)); \ + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(unsigned int), &n)); \ + DEFINE_LOCAL_AND_GLOBAL_SIZE(n); \ + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, \ + &local_size, 0, NULL, NULL)); \ + CL_CHECK(clFinish(queue)); \ + void* mapped_y = clEnqueueMapBuffer( \ + queue, d_y, CL_TRUE, CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &error); \ + CL_CHECK(error); \ + CL_CHECK(clEnqueueUnmapMemObject( \ + CaffeOpenCL::queue(), d_y, mapped_y, \ + 0, NULL, NULL)); \ + CL_CHECK(clReleaseMemObject(d_a)); \ + CL_CHECK(clReleaseMemObject(d_b)); \ + CL_CHECK(clReleaseMemObject(d_y)); \ + CL_CHECK(clReleaseProgram(program)); \ + CL_CHECK(clReleaseKernel(kernel)); \ +} + +#define DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(name, operation) \ + DEFINE_OPENCL_BINARY_FUNC(float, name, operation) \ + DEFINE_OPENCL_BINARY_FUNC(double, name, operation) + +inline clblasTranspose to_clblasTranspose(const CBLAS_TRANSPOSE trans) { + switch (trans) { + case CblasNoTrans: + return clblasNoTrans; + case CblasTrans: + return clblasTrans; + case CblasConjTrans: + return clblasConjTrans; + default: + LOG(FATAL) << "Unknown CBLAS_TRANSPOSE " << trans; + } +} + +template +void caffe_opencl_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int n, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); + + +template +void caffe_opencl_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int n, + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); + +template +void caffe_opencl_axpy(const int n, const Dtype alpha, const Dtype* x, + Dtype* y); + +template +void caffe_opencl_axpby(const int n, const Dtype alpha, const Dtype* x, + const Dtype beta, Dtype* y); + +template +void caffe_opencl_copy(const int n, const Dtype *x, Dtype *y); + +template +void caffe_opencl_set(const int n, const Dtype alpha, Dtype *x); + +template +void caffe_opencl_add_scalar(const int n, const Dtype alpha, Dtype *x); + +template +void caffe_opencl_scal(const int n, const Dtype alpha, Dtype *x); + +template +Dtype caffe_opencl_dot(const int n, const Dtype* x, const Dtype* y); + +template +int caffe_opencl_hamming_distance(const int n, const Dtype* x, const Dtype* y); + +// Returns the sum of the absolute values of the elements of vector x +template +Dtype caffe_opencl_asum(const int n, const Dtype* x); + +template +void caffe_opencl_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); + +template +void caffe_opencl_copy_from_cpu(const int n, const Dtype *x, Dtype *y); + +template +void caffe_opencl_sqr(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_exp(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_sign(const int n, 
const Dtype* x, Dtype* y); + +template +void caffe_opencl_sgnbit(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_fabs(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_add(const int n, const Dtype* a, + const Dtype* b, Dtype* y); + +template +void caffe_opencl_sub(const int n, const Dtype* a, + const Dtype* b, Dtype* y); + +template +void caffe_opencl_mul(const int n, const Dtype* a, + const Dtype* b, Dtype* y); + +template +void caffe_opencl_div(const int n, const Dtype* a, + const Dtype* b, Dtype* y); +} // namespace caffe + + +#endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ +#endif // USE_OPENCL + diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index b68dcbf6e83..abe6f0c6483 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -33,6 +33,12 @@ class ArgMaxLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) { + NOT_IMPLEMENTED; + } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_ARGMAX; @@ -41,12 +47,6 @@ class ArgMaxLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - NOT_IMPLEMENTED; - } bool out_max_val_; }; @@ -61,6 +61,11 @@ class ConcatLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, + vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_CONCAT; @@ -69,15 +74,6 @@ class ConcatLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - Blob col_bob_; int count_; int num_; @@ -96,6 +92,10 @@ class ConvolutionLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_CONVOLUTION; @@ -104,15 +104,6 @@ class ConvolutionLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - int kernel_size_; int stride_; int num_; @@ -140,6 +131,10 @@ class EltwiseLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const 
vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_ELTWISE; @@ -148,15 +143,6 @@ class EltwiseLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - EltwiseParameter_EltwiseOp op_; vector coeffs_; }; @@ -170,6 +156,10 @@ class FlattenLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_FLATTEN; @@ -178,15 +168,6 @@ class FlattenLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - int count_; }; @@ -199,6 +180,10 @@ class Im2colLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_IM2COL; @@ -207,15 +192,6 @@ class Im2colLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - int kernel_size_; int stride_; int channels_; @@ -233,6 +209,10 @@ class InnerProductLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_INNER_PRODUCT; @@ -241,15 +221,6 @@ class InnerProductLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - int M_; int K_; int N_; @@ -414,23 +385,12 @@ class SplitLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* 
top); - - virtual inline LayerParameter_LayerType type() const { - return LayerParameter_LayerType_SPLIT; - } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, + virtual Dtype Forward(const vector*>& bottom, vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, + virtual void Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom); + protected: int count_; }; diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index e603712fd82..64d77bdbfeb 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -96,6 +96,58 @@ Dtype* Blob::mutable_gpu_diff() { return reinterpret_cast(diff_->mutable_gpu_data()); } +template +const Dtype* Blob::const_data() const { + switch (Caffe::mode()) { + case Caffe::CPU: + return cpu_data(); + case Caffe::GPU: + return gpu_data(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(0); + } +} + +template +const Dtype* Blob::const_diff() const { + switch (Caffe::mode()) { + case Caffe::CPU: + return cpu_diff(); + case Caffe::GPU: + return gpu_diff(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(0); + } +} + +template +Dtype* Blob::mutable_data() { + switch (Caffe::mode()) { + case Caffe::CPU: + return mutable_cpu_data(); + case Caffe::GPU: + return mutable_gpu_data(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} + +template +Dtype* Blob::mutable_diff() { + switch (Caffe::mode()) { + case Caffe::CPU: + return mutable_cpu_diff(); + case Caffe::GPU: + return mutable_gpu_diff(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} + template void Blob::ShareData(const Blob& other) { CHECK_EQ(count_, other.count()); diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 631c8afd068..82d4f16cd70 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -10,7 +10,6 @@ namespace caffe { shared_ptr Caffe::singleton_; - // curand seeding int64_t cluster_seedgen(void) { int64_t s, seed, pid; diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp index 409965519ca..4c68c4a8fd4 100644 --- a/src/caffe/layers/accuracy_layer.cpp +++ b/src/caffe/layers/accuracy_layer.cpp @@ -30,11 +30,11 @@ void AccuracyLayer::SetUp( } template -Dtype AccuracyLayer::Forward_cpu(const vector*>& bottom, +Dtype AccuracyLayer::Forward(const vector*>& bottom, vector*>* top) { Dtype accuracy = 0; - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + const Dtype* bottom_label = bottom[1]->const_data(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); vector maxval(top_k_+1); @@ -62,7 +62,7 @@ Dtype AccuracyLayer::Forward_cpu(const vector*>& bottom, } // LOG(INFO) << "Accuracy: " << accuracy; - (*top)[0]->mutable_cpu_data()[0] = accuracy / num; + (*top)[0]->mutable_data()[0] = accuracy / num; // Accuracy layer should not be used as a loss function. 
return Dtype(0); diff --git a/src/caffe/layers/argmax_layer.cpp b/src/caffe/layers/argmax_layer.cpp index cc31c0f52d8..1c0c402d5ea 100644 --- a/src/caffe/layers/argmax_layer.cpp +++ b/src/caffe/layers/argmax_layer.cpp @@ -24,10 +24,10 @@ void ArgMaxLayer::SetUp(const vector*>& bottom, } template -Dtype ArgMaxLayer::Forward_cpu(const vector*>& bottom, +Dtype ArgMaxLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); for (int i = 0; i < num; ++i) { diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index 95e6bd8748c..3ff9e3f99df 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -15,8 +15,8 @@ const float kBNLL_THRESHOLD = 50.; template Dtype BNLLLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { top_data[i] = bottom_data[i] > 0 ? @@ -31,9 +31,9 @@ void BNLLLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); const int count = (*bottom)[0]->count(); Dtype expval; for (int i = 0; i < count; ++i) { diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index 4c894ddffc4..1b90181f20a 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -4,7 +4,7 @@ #include "caffe/layer.hpp" #include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" +#include "caffe/util/device.hpp" namespace caffe { @@ -41,63 +41,68 @@ void ConcatLayer::SetUp(const vector*>& bottom, } template -Dtype ConcatLayer::Forward_cpu(const vector*>& bottom, - vector*>* top) { - Dtype* top_data = (*top)[0]->mutable_cpu_data(); +Dtype ConcatLayer::Forward(const vector*>& bottom, + vector*>* top) { + Dtype* top_data = (*top)[0]->mutable_data(); if (concat_dim_== 0) { int offset_num = 0; for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); + const Dtype* bottom_data = bottom[i]->const_data(); int num_elem = bottom[i]->count(); - caffe_copy(num_elem, bottom_data, top_data+(*top)[0]->offset(offset_num)); + DeviceFactory::GetDevice()->copy(num_elem, bottom_data, + top_data+(*top)[0]->offset(offset_num)); offset_num += bottom[i]->num(); } } else if (concat_dim_ == 1) { int offset_channel = 0; for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); + const Dtype* bottom_data = bottom[i]->const_data(); int num_elem = bottom[i]->channels()*bottom[i]->height()*bottom[i]->width(); for (int n = 0; n < num_; ++n) { - caffe_copy(num_elem, bottom_data+bottom[i]->offset(n), + DeviceFactory::GetDevice()->copy( + num_elem, bottom_data+bottom[i]->offset(n), top_data+(*top)[0]->offset(n, offset_channel)); } 
offset_channel += bottom[i]->channels(); - } // concat_dim_ is guaranteed to be 0 or 1 by SetUp. + } + } else { + LOG(FATAL) << "concat_dim along dim" << concat_dim_ << + " not implemented yet"; } return Dtype(0.); } template -void ConcatLayer::Backward_cpu(const vector*>& top, +void ConcatLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); if (concat_dim_ == 0) { int offset_num = 0; for (int i = 0; i < bottom->size(); ++i) { Blob* blob = (*bottom)[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_cpu_diff(); - caffe_copy(blob->count(), top_diff + top[0]->offset(offset_num), - bottom_diff); - } + Dtype* bottom_diff = blob->mutable_diff(); + DeviceFactory::GetDevice()->copy(blob->count(), + top_diff+top[0]->offset(offset_num), bottom_diff); offset_num += blob->num(); } } else if (concat_dim_ == 1) { int offset_channel = 0; for (int i = 0; i < bottom->size(); ++i) { Blob* blob = (*bottom)[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_cpu_diff(); - int num_elem = blob->channels()*blob->height()*blob->width(); - for (int n = 0; n < num_; ++n) { - caffe_copy(num_elem, top_diff + top[0]->offset(n, offset_channel), - bottom_diff + blob->offset(n)); - } + Dtype* bottom_diff = blob->mutable_diff(); + int num_elem = blob->channels()*blob->height()*blob->width(); + for (int n = 0; n < num_; ++n) { + DeviceFactory::GetDevice()->copy( + num_elem, top_diff+top[0]->offset(n, offset_channel), + bottom_diff+blob->offset(n)); } offset_channel += blob->channels(); } - } // concat_dim_ is guaranteed to be 0 or 1 by SetUp. + } else { + LOG(FATAL) << "concat_dim along dim" << concat_dim_ << + " not implemented yet"; + } } INSTANTIATE_CLASS(ConcatLayer); diff --git a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/concat_layer.cu deleted file mode 100644 index ca0cf0c1b5b..00000000000 --- a/src/caffe/layers/concat_layer.cu +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
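The layer hunks in this patch route all math through DeviceFactory::GetDevice() instead of the caffe_cpu_*/caffe_gpu_* wrappers, which is what lets the per-layer .cu files be deleted. The caffe/util/device.hpp header itself is not reproduced in this part of the diff; a plausible shape for it, inferred only from the call sites and therefore hypothetical, is an abstract per-mode Device selected at runtime:

    // Hypothetical sketch of caffe/util/device.hpp, inferred from the call
    // sites in this patch; the real header may differ. CBLAS_TRANSPOSE comes
    // from the BLAS headers Caffe already depends on.
    template <typename Dtype>
    class Device {
     public:
      virtual ~Device() {}
      virtual void copy(const int n, const Dtype* x, Dtype* y) = 0;
      virtual void copy_from_cpu(const int n, const Dtype* x, Dtype* y) = 0;
      virtual void set(const int n, const Dtype alpha, Dtype* x) = 0;
      virtual void axpy(const int n, const Dtype alpha,
                        const Dtype* x, Dtype* y) = 0;
      virtual void gemm(const CBLAS_TRANSPOSE trans_a,
                        const CBLAS_TRANSPOSE trans_b,
                        const int m, const int n, const int k,
                        const Dtype alpha, const Dtype* a, const Dtype* b,
                        const Dtype beta, Dtype* c) = 0;
      // ... plus gemv, mul, div, sub, dot, asum, scal, scale, sign, axpby,
      // im2col, col2im and rng_bernoulli with the signatures used in the
      // layer hunks.
    };

    template <typename Dtype>
    class DeviceFactory {
     public:
      // Returns the Device implementation matching Caffe::mode().
      static Device<Dtype>* GetDevice();
    };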
- -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype ConcatLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - if (concat_dim_ == 0) { - int offset_num = 0; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - caffe_gpu_copy(bottom[i]->count(), bottom_data, - top_data + (*top)[0]->offset(offset_num)); - offset_num += bottom[i]->num(); - } - } else if (concat_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - int num_elem = - bottom[i]->channels() * bottom[i]->height() * bottom[i]->width(); - for (int n = 0; n < num_; ++n) { - caffe_gpu_copy(num_elem, bottom_data+bottom[i]->offset(n), - top_data + (*top)[0]->offset(n, offset_channel)); - } - offset_channel += bottom[i]->channels(); - } - } else { - LOG(FATAL) << "concat_dim along dim" << concat_dim_ << - " not implemented yet"; - } - return Dtype(0.); -} - -template -void ConcatLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - if (concat_dim_ == 0) { - int offset_num = 0; - for (int i = 0; i < bottom->size(); ++i) { - Blob* blob = (*bottom)[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_gpu_diff(); - caffe_gpu_copy(blob->count(), top_diff + top[0]->offset(offset_num), - bottom_diff); - } - offset_num += blob->num(); - } - } else if (concat_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < bottom->size(); ++i) { - Blob* blob = (*bottom)[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_gpu_diff(); - int num_elem = blob->channels()*blob->height()*blob->width(); - for (int n = 0; n < num_; ++n) { - caffe_gpu_copy(num_elem, top_diff + top[0]->offset(n, offset_channel), - bottom_diff + blob->offset(n)); - } - } - offset_channel += blob->channels(); - } - } else { - LOG(FATAL) << "concat_dim along dim" << concat_dim_ << - " not implemented yet"; - } -} - -INSTANTIATE_CLASS(ConcatLayer); - -} // namespace caffe diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 9ec8da47e1a..f791d32affb 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -68,7 +68,7 @@ void ConvolutionLayer::SetUp(const vector*>& bottom, if (bias_term_) { bias_multiplier_.reset(new SyncedMemory(N_ * sizeof(Dtype))); Dtype* bias_multiplier_data = - reinterpret_cast(bias_multiplier_->mutable_cpu_data()); + reinterpret_cast(bias_multiplier_->mutable_data()); for (int i = 0; i < N_; ++i) { bias_multiplier_data[i] = 1.; } @@ -77,30 +77,33 @@ void ConvolutionLayer::SetUp(const vector*>& bottom, template -Dtype ConvolutionLayer::Forward_cpu(const vector*>& bottom, +Dtype ConvolutionLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - Dtype* col_data = col_buffer_.mutable_cpu_data(); - const Dtype* weight = this->blobs_[0]->cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); + Dtype* col_data = col_buffer_.mutable_data(); + const Dtype* weight = this->blobs_[0]->const_data(); int weight_offset = M_ * K_; int col_offset = K_ * N_; int top_offset = M_ * N_; for (int n = 0; n < num_; ++n) { // First, im2col - 
im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_size_, pad_, stride_, col_data); + DeviceFactory::GetDevice()->im2col( + bottom_data + bottom[0]->offset(n), channels_, height_, + width_, kernel_size_, pad_, stride_, col_data); // Second, innerproduct with groups for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, K_, - (Dtype)1., weight + weight_offset * g, col_data + col_offset * g, - (Dtype)0., top_data + (*top)[0]->offset(n) + top_offset * g); + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasNoTrans, M_, N_, K_, + (Dtype)1., weight + weight_offset * g, col_data + col_offset * g, + (Dtype)0., top_data + (*top)[0]->offset(n) + top_offset * g); } // third, add bias if (bias_term_) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - N_, 1, (Dtype)1., this->blobs_[1]->cpu_data(), - reinterpret_cast(bias_multiplier_->cpu_data()), + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasNoTrans, num_output_, + N_, 1, (Dtype)1., this->blobs_[1]->const_data(), + reinterpret_cast(bias_multiplier_->const_data()), (Dtype)1., top_data + (*top)[0]->offset(n)); } } @@ -108,25 +111,26 @@ Dtype ConvolutionLayer::Forward_cpu(const vector*>& bottom, } template -void ConvolutionLayer::Backward_cpu(const vector*>& top, +void ConvolutionLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* weight = this->blobs_[0]->cpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - Dtype* col_data = col_buffer_.mutable_cpu_data(); - Dtype* col_diff = col_buffer_.mutable_cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); + const Dtype* weight = this->blobs_[0]->const_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + Dtype* col_data = col_buffer_.mutable_data(); + Dtype* col_diff = col_buffer_.mutable_diff(); // bias gradient if necessary Dtype* bias_diff = NULL; if (bias_term_) { - bias_diff = this->blobs_[1]->mutable_cpu_diff(); - memset(bias_diff, 0, sizeof(Dtype) * this->blobs_[1]->count()); + bias_diff = this->blobs_[1]->mutable_diff(); + DeviceFactory::GetDevice()->set( + this->blobs_[1]->count(), 0, bias_diff); for (int n = 0; n < num_; ++n) { - caffe_cpu_gemv(CblasNoTrans, num_output_, N_, + DeviceFactory::GetDevice()->gemv(CblasNoTrans, num_output_, N_, 1., top_diff + top[0]->offset(n), - reinterpret_cast(bias_multiplier_->cpu_data()), 1., + reinterpret_cast(bias_multiplier_->const_data()), 1., bias_diff); } } @@ -134,29 +138,34 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, int weight_offset = M_ * K_; int col_offset = K_ * N_; int top_offset = M_ * N_; - memset(weight_diff, 0, sizeof(Dtype) * this->blobs_[0]->count()); + DeviceFactory::GetDevice()->set( + this->blobs_[0]->count(), 0, weight_diff); for (int n = 0; n < num_; ++n) { // since we saved memory in the forward pass by not storing all col data, // we will need to recompute them. - im2col_cpu(bottom_data + (*bottom)[0]->offset(n), channels_, height_, - width_, kernel_size_, pad_, stride_, col_data); + DeviceFactory::GetDevice()->im2col( + bottom_data + (*bottom)[0]->offset(n), channels_, height_, + width_, kernel_size_, pad_, stride_, col_data); // gradient w.r.t. weight. 
Note that we will accumulate diffs. for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasNoTrans, CblasTrans, M_, K_, N_, - (Dtype)1., top_diff + top[0]->offset(n) + top_offset * g, - col_data + col_offset * g, (Dtype)1., - weight_diff + weight_offset * g); + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasTrans, M_, K_, N_, + (Dtype)1., top_diff + top[0]->offset(n) + top_offset * g, + col_data + col_offset * g, (Dtype)1., + weight_diff + weight_offset * g); } // gradient w.r.t. bottom data, if necessary if (propagate_down[0]) { for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasTrans, CblasNoTrans, K_, N_, M_, - (Dtype)1., weight + weight_offset * g, - top_diff + top[0]->offset(n) + top_offset * g, - (Dtype)0., col_diff + col_offset * g); + DeviceFactory::GetDevice()->gemm( + CblasTrans, CblasNoTrans, K_, N_, M_, + (Dtype)1., weight + weight_offset * g, + top_diff + top[0]->offset(n) + top_offset * g, + (Dtype)0., col_diff + col_offset * g); } // col2im back to the data - col2im_cpu(col_diff, channels_, height_, width_, kernel_size_, pad_, + DeviceFactory::GetDevice()->col2im( + col_diff, channels_, height_, width_, kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n)); } } diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu deleted file mode 100644 index 85f95fd32c9..00000000000 --- a/src/caffe/layers/conv_layer.cu +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/filler.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype ConvolutionLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - Dtype* col_data = col_buffer_.mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - int weight_offset = M_ * K_; - int col_offset = K_ * N_; - int top_offset = M_ * N_; - for (int n = 0; n < num_; ++n) { - // First, im2col - im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_size_, pad_, stride_, col_data); - // Second, innerproduct with groups - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, K_, - (Dtype)1., weight + weight_offset * g, col_data + col_offset * g, - (Dtype)0., top_data + (*top)[0]->offset(n) + top_offset * g); - } - // third, add bias - if (bias_term_) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - N_, 1, (Dtype)1., this->blobs_[1]->gpu_data(), - reinterpret_cast(bias_multiplier_->gpu_data()), - (Dtype)1., top_data + (*top)[0]->offset(n)); - } - } - return Dtype(0.); -} - -template -void ConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - const Dtype* bottom_data = (*bottom)[0]->gpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - Dtype* col_data = col_buffer_.mutable_gpu_data(); - Dtype* col_diff = col_buffer_.mutable_gpu_diff(); - // bias gradient if necessary - Dtype* bias_diff = NULL; - - if (bias_term_) { - bias_diff = this->blobs_[1]->mutable_gpu_diff(); - CUDA_CHECK(cudaMemset(bias_diff, 0, - sizeof(Dtype) * this->blobs_[1]->count())); - for (int n = 0; n < num_; ++n) { - 
caffe_gpu_gemv(CblasNoTrans, num_output_, N_, - 1., top_diff + top[0]->offset(n), - reinterpret_cast(bias_multiplier_->gpu_data()), - 1., bias_diff); - } - } - - int weight_offset = M_ * K_; - int col_offset = K_ * N_; - int top_offset = M_ * N_; - CUDA_CHECK(cudaMemset(weight_diff, 0, - sizeof(Dtype) * this->blobs_[0]->count())); - for (int n = 0; n < num_; ++n) { - // since we saved memory in the forward pass by not storing all col data, - // we will need to recompute them. - im2col_gpu(bottom_data + (*bottom)[0]->offset(n), channels_, height_, - width_, kernel_size_, pad_, stride_, col_data); - // gradient w.r.t. weight. Note that we will accumulate diffs. - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, K_, N_, - (Dtype)1., top_diff + top[0]->offset(n) + top_offset * g, - col_data + col_offset * g, (Dtype)1., - weight_diff + weight_offset * g); - } - // gradient w.r.t. bottom data, if necessary - if (propagate_down[0]) { - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasTrans, CblasNoTrans, K_, N_, M_, - (Dtype)1., weight + weight_offset * g, - top_diff + top[0]->offset(n) + top_offset * g, - (Dtype)0., col_diff + col_offset * g); - } - // col2im back to the data - col2im_gpu(col_diff, channels_, height_, width_, kernel_size_, pad_, - stride_, bottom_diff + (*bottom)[0]->offset(n)); - } - } -} - - -INSTANTIATE_CLASS(ConvolutionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 29c4fec8ca4..c3503d838cd 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -25,10 +25,10 @@ void* DataLayerPrefetch(void* layer_pointer) { CHECK(layer); Datum datum; CHECK(layer->prefetch_data_); - Dtype* top_data = layer->prefetch_data_->mutable_cpu_data(); + Dtype* top_data = layer->prefetch_data_->mutable_data(); Dtype* top_label = NULL; // suppress warnings about uninitialized variables if (layer->output_labels_) { - top_label = layer->prefetch_label_->mutable_cpu_data(); + top_label = layer->prefetch_label_->mutable_data(); } const Dtype scale = layer->layer_param_.data_param().scale(); const int batch_size = layer->layer_param_.data_param().batch_size(); @@ -44,7 +44,7 @@ void* DataLayerPrefetch(void* layer_pointer) { const int height = layer->datum_height_; const int width = layer->datum_width_; const int size = layer->datum_size_; - const Dtype* mean = layer->data_mean_.cpu_data(); + const Dtype* mean = layer->data_mean_.const_data(); for (int item_id = 0; item_id < batch_size; ++item_id) { // get a blob switch (layer->layer_param_.data_param().backend()) { @@ -302,14 +302,14 @@ void DataLayer::SetUp(const vector*>& bottom, data_mean_.Reshape(1, datum_channels_, datum_height_, datum_width_); } // Now, start the prefetch thread. Before calling prefetch, we make two - // cpu_data calls so that the prefetch thread does not accidentally make + // const_data calls so that the prefetch thread does not accidentally make // simultaneous cudaMalloc calls when the main thread is running. In some // GPUs this seems to cause failures if we do not so. 
- prefetch_data_->mutable_cpu_data(); + prefetch_data_->mutable_data(); if (output_labels_) { - prefetch_label_->mutable_cpu_data(); + prefetch_label_->mutable_data(); } - data_mean_.cpu_data(); + data_mean_.const_data(); DLOG(INFO) << "Initializing prefetch"; CreatePrefetchThread(); DLOG(INFO) << "Prefetch initialized."; @@ -346,16 +346,18 @@ unsigned int DataLayer::PrefetchRand() { } template -Dtype DataLayer::Forward_cpu(const vector*>& bottom, +Dtype DataLayer::Forward(const vector*>& bottom, vector*>* top) { // First, join the thread JoinPrefetchThread(); // Copy the data - caffe_copy(prefetch_data_->count(), prefetch_data_->cpu_data(), - (*top)[0]->mutable_cpu_data()); + DeviceFactory::GetDevice()->copy_from_cpu( + prefetch_data_->count(), prefetch_data_->const_data(), + (*top)[0]->mutable_data()); if (output_labels_) { - caffe_copy(prefetch_label_->count(), prefetch_label_->cpu_data(), - (*top)[1]->mutable_cpu_data()); + DeviceFactory::GetDevice()->copy_from_cpu( + prefetch_label_->count(), prefetch_label_->const_data(), + (*top)[1]->mutable_data()); } // Start a new prefetch thread CreatePrefetchThread(); diff --git a/src/caffe/layers/data_layer.cu b/src/caffe/layers/data_layer.cu deleted file mode 100644 index 2ff9a292b3e..00000000000 --- a/src/caffe/layers/data_layer.cu +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include -#include -#include - -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -using std::string; - -namespace caffe { - -template -Dtype DataLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - // First, join the thread - JoinPrefetchThread(); - // Copy the data - CUDA_CHECK(cudaMemcpy((*top)[0]->mutable_gpu_data(), - prefetch_data_->cpu_data(), sizeof(Dtype) * prefetch_data_->count(), - cudaMemcpyHostToDevice)); - if (output_labels_) { - CUDA_CHECK(cudaMemcpy((*top)[1]->mutable_gpu_data(), - prefetch_label_->cpu_data(), sizeof(Dtype) * prefetch_label_->count(), - cudaMemcpyHostToDevice)); - } - // Start a new prefetch thread - CreatePrefetchThread(); - return Dtype(0.); -} - -INSTANTIATE_CLASS(DataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index e9a1a524d63..eafb74aaae7 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -29,18 +29,20 @@ void DropoutLayer::SetUp(const vector*>& bottom, template Dtype DropoutLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - unsigned int* mask = rand_vec_->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); + int* mask = reinterpret_cast(rand_vec_->mutable_data()); const int count = bottom[0]->count(); if (Caffe::phase() == Caffe::TRAIN) { // Create random numbers - caffe_rng_bernoulli(count, 1. - threshold_, mask); + DeviceFactory::GetDevice()->rng_bernoulli(count, 1. 
- threshold_, + mask); for (int i = 0; i < count; ++i) { top_data[i] = bottom_data[i] * mask[i] * scale_; } } else { - caffe_copy(bottom[0]->count(), bottom_data, top_data); + DeviceFactory::GetDevice()->copy(bottom[0]->count(), bottom_data, + top_data); } return Dtype(0); } @@ -51,9 +53,9 @@ void DropoutLayer::Backward_cpu(const vector*>& top, vector*>* bottom) { CHECK(Caffe::phase() == Caffe::TRAIN); if (propagate_down[0]) { - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - const unsigned int* mask = rand_vec_->cpu_data(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + const unsigned int* mask = rand_vec_->const_data(); const int count = (*bottom)[0]->count(); for (int i = 0; i < count; ++i) { bottom_diff[i] = top_diff[i] * mask[i] * scale_; @@ -61,8 +63,6 @@ void DropoutLayer::Backward_cpu(const vector*>& top, } } - INSTANTIATE_CLASS(DropoutLayer); - } // namespace caffe diff --git a/src/caffe/layers/dummy_data_layer.cpp b/src/caffe/layers/dummy_data_layer.cpp index 58044f4c952..a4b0b17a2e4 100644 --- a/src/caffe/layers/dummy_data_layer.cpp +++ b/src/caffe/layers/dummy_data_layer.cpp @@ -84,7 +84,7 @@ void DummyDataLayer::SetUp(const vector*>& bottom, } template -Dtype DummyDataLayer::Forward_cpu(const vector*>& bottom, +Dtype DummyDataLayer::Forward(const vector*>& bottom, vector*>* top) { for (int i = 0; i < top->size(); ++i) { const int filler_id = (fillers_.size() > 1) ? i : 0; diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index 2c265f6678f..adf879e6842 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -41,22 +41,25 @@ void EltwiseLayer::SetUp(const vector*>& bottom, } template -Dtype EltwiseLayer::Forward_cpu( +Dtype EltwiseLayer::Forward( const vector*>& bottom, vector*>* top) { const int count = (*top)[0]->count(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + Dtype* top_data = (*top)[0]->mutable_data(); switch (op_) { case EltwiseParameter_EltwiseOp_PROD: - caffe_mul(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), top_data); + DeviceFactory::GetDevice()->mul(count, bottom[0]->const_data(), + bottom[1]->const_data(), top_data); for (int i = 2; i < bottom.size(); ++i) { - caffe_mul(count, top_data, bottom[i]->cpu_data(), top_data); + DeviceFactory::GetDevice()->mul( + count, top_data, bottom[i]->const_data(), top_data); } break; case EltwiseParameter_EltwiseOp_SUM: - caffe_set(count, Dtype(0), top_data); + DeviceFactory::GetDevice()->set(count, Dtype(0), top_data); // TODO(shelhamer) does BLAS optimize to sum for coeff = 1? 
for (int i = 0; i < bottom.size(); ++i) { - caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data); + DeviceFactory::GetDevice()->axpy( + count, coeffs_[i], bottom[i]->const_data(), top_data); } break; default: @@ -66,25 +69,29 @@ Dtype EltwiseLayer::Forward_cpu( } template -void EltwiseLayer::Backward_cpu(const vector*>& top, +void EltwiseLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { const int count = top[0]->count(); - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_data = top[0]->const_data(); + const Dtype* top_diff = top[0]->const_diff(); for (int i = 0; i < bottom->size(); ++i) { if (propagate_down[i]) { - const Dtype* bottom_data = (*bottom)[i]->cpu_data(); - Dtype* bottom_diff = (*bottom)[i]->mutable_cpu_diff(); + const Dtype* bottom_data = (*bottom)[i]->const_data(); + Dtype* bottom_diff = (*bottom)[i]->mutable_diff(); switch (op_) { case EltwiseParameter_EltwiseOp_PROD: - caffe_div(count, top_data, bottom_data, bottom_diff); - caffe_mul(count, bottom_diff, top_diff, bottom_diff); + DeviceFactory::GetDevice()->div( + count, top_data, bottom_data, bottom_diff); + DeviceFactory::GetDevice()->mul( + count, bottom_diff, top_diff, bottom_diff); break; case EltwiseParameter_EltwiseOp_SUM: if (coeffs_[i] == Dtype(1)) { - caffe_copy(count, top_diff, bottom_diff); + DeviceFactory::GetDevice()->copy( + count, top_diff, bottom_diff); } else { - caffe_cpu_scale(count, coeffs_[i], top_diff, bottom_diff); + DeviceFactory::GetDevice()->scale( + count, coeffs_[i], top_diff, bottom_diff); } break; default: diff --git a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/eltwise_layer.cu deleted file mode 100644 index 3860944889c..00000000000 --- a/src/caffe/layers/eltwise_layer.cu +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype EltwiseLayer::Forward_gpu( - const vector*>& bottom, vector*>* top) { - const int count = (*top)[0]->count(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - caffe_gpu_mul(count, bottom[0]->gpu_data(), - bottom[1]->gpu_data(), top_data); - for (int i = 2; i < bottom.size(); ++i) { - caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_SUM: - caffe_gpu_set(count, Dtype(0.), top_data); - // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? 
- for (int i = 0; i < bottom.size(); ++i) { - caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } - return Dtype(0.); -} - -template -void EltwiseLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - const int count = top[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - for (int i = 0; i < bottom->size(); ++i) { - if (propagate_down[i]) { - const Dtype* bottom_data = (*bottom)[i]->gpu_data(); - Dtype* bottom_diff = (*bottom)[i]->mutable_gpu_diff(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); - break; - case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1.)) { - caffe_gpu_copy(count, top_diff, bottom_diff); - } else { - caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } - } - } -} - -INSTANTIATE_CLASS(EltwiseLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index 2478a514cac..a88ec8ea3cc 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -20,34 +20,36 @@ void EuclideanLossLayer::FurtherSetUp( } template -Dtype EuclideanLossLayer::Forward_cpu(const vector*>& bottom, +Dtype EuclideanLossLayer::Forward(const vector*>& bottom, vector*>* top) { int count = bottom[0]->count(); - caffe_sub( + DeviceFactory::GetDevice()->sub( count, - bottom[0]->cpu_data(), - bottom[1]->cpu_data(), - diff_.mutable_cpu_data()); - Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data()); + bottom[0]->const_data(), + bottom[1]->const_data(), + diff_.mutable_data()); + Dtype dot; + DeviceFactory::GetDevice()->dot(count, diff_.const_data(), + diff_.const_data(), &dot); Dtype loss = dot / bottom[0]->num() / Dtype(2); if (top->size() == 1) { - (*top)[0]->mutable_cpu_data()[0] = loss; + (*top)[0]->mutable_data()[0] = loss; } return loss; } template -void EuclideanLossLayer::Backward_cpu(const vector*>& top, +void EuclideanLossLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { for (int i = 0; i < 2; ++i) { if (propagate_down[i]) { const Dtype sign = (i == 0) ? 1 : -1; - caffe_cpu_axpby( + DeviceFactory::GetDevice()->axpby( (*bottom)[i]->count(), // count sign / (*bottom)[i]->num(), // alpha - diff_.cpu_data(), // a + diff_.const_data(), // a Dtype(0), // beta - (*bottom)[i]->mutable_cpu_diff()); // b + (*bottom)[i]->mutable_diff()); // b } } } diff --git a/src/caffe/layers/euclidean_loss_layer.cu b/src/caffe/layers/euclidean_loss_layer.cu deleted file mode 100644 index b070ea96ff8..00000000000 --- a/src/caffe/layers/euclidean_loss_layer.cu +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
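For reference, the unified Euclidean-loss forward pass from the hunk above reduces to one device sub and one device dot; it is restated here with the <Dtype> template arguments that the flattened diff omits:

    template <typename Dtype>
    Dtype EuclideanLossLayer<Dtype>::Forward(
        const vector<Blob<Dtype>*>& bottom, vector<Blob<Dtype>*>* top) {
      int count = bottom[0]->count();
      // diff_ = bottom[0] - bottom[1], computed on whichever device is active.
      DeviceFactory<Dtype>::GetDevice()->sub(
          count, bottom[0]->const_data(), bottom[1]->const_data(),
          diff_.mutable_data());
      Dtype dot;
      DeviceFactory<Dtype>::GetDevice()->dot(
          count, diff_.const_data(), diff_.const_data(), &dot);
      Dtype loss = dot / bottom[0]->num() / Dtype(2);
      if (top->size() == 1) {
        (*top)[0]->mutable_data()[0] = loss;
      }
      return loss;
    }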
- -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/util/io.hpp" - -namespace caffe { - -template -Dtype EuclideanLossLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), - bottom[1]->gpu_data(), - diff_.mutable_gpu_data()); - Dtype dot; - caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); - Dtype loss = dot / bottom[0]->num() / Dtype(2); - return loss; -} - -template -void EuclideanLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const Dtype sign = (i == 0) ? 1 : -1; - caffe_gpu_axpby( - (*bottom)[i]->count(), // count - sign / (*bottom)[i]->num(), // alpha - diff_.gpu_data(), // a - Dtype(0), // beta - (*bottom)[i]->mutable_gpu_diff()); // b - } - } -} - -INSTANTIATE_CLASS(EuclideanLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp index 9494da9a255..d9799db026f 100644 --- a/src/caffe/layers/flatten_layer.cpp +++ b/src/caffe/layers/flatten_layer.cpp @@ -21,14 +21,14 @@ void FlattenLayer::SetUp(const vector*>& bottom, } template -Dtype FlattenLayer::Forward_cpu(const vector*>& bottom, +Dtype FlattenLayer::Forward(const vector*>& bottom, vector*>* top) { (*top)[0]->ShareData(*bottom[0]); return Dtype(0.); } template -void FlattenLayer::Backward_cpu(const vector*>& top, +void FlattenLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { (*bottom)[0]->ShareDiff(*top[0]); } diff --git a/src/caffe/layers/flatten_layer.cu b/src/caffe/layers/flatten_layer.cu deleted file mode 100644 index 68add383c48..00000000000 --- a/src/caffe/layers/flatten_layer.cu +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
- -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype FlattenLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - (*top)[0]->ShareData(*bottom[0]); - return Dtype(0.); -} - -template -void FlattenLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - (*bottom)[0]->ShareDiff(*top[0]); -} - -INSTANTIATE_CLASS(FlattenLayer); - -} // namespace caffe diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 2ba7fa77f45..02121512f16 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -86,7 +86,7 @@ void HDF5DataLayer::SetUp(const vector*>& bottom, } template -Dtype HDF5DataLayer::Forward_cpu(const vector*>& bottom, +Dtype HDF5DataLayer::Forward(const vector*>& bottom, vector*>* top) { const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); const int data_count = (*top)[0]->count() / (*top)[0]->num(); @@ -104,12 +104,13 @@ Dtype HDF5DataLayer::Forward_cpu(const vector*>& bottom, } current_row_ = 0; } - memcpy(&(*top)[0]->mutable_cpu_data()[i * data_count], - &data_blob_.cpu_data()[current_row_ * data_count], - sizeof(Dtype) * data_count); - memcpy(&(*top)[1]->mutable_cpu_data()[i * label_data_count], - &label_blob_.cpu_data()[current_row_ * label_data_count], - sizeof(Dtype) * label_data_count); + DeviceFactory::GetDevice()->copy_from_cpu( + data_count, &data_blob_.const_data()[current_row_ * data_count], + &(*top)[0]->mutable_data()[i * data_count]); + DeviceFactory::GetDevice()->copy_from_cpu( + label_data_count, + &label_blob_.const_data()[current_row_ * label_data_count], + &(*top)[1]->mutable_data()[i * label_data_count]); } return Dtype(0.); } diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu deleted file mode 100644 index b2b09ef7dd1..00000000000 --- a/src/caffe/layers/hdf5_data_layer.cu +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
-/* -TODO: -- only load parts of the file, in accordance with a prototxt param "max_mem" -*/ - -#include -#include -#include - -#include "hdf5.h" -#include "hdf5_hl.h" - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -using std::string; - -namespace caffe { - -template -Dtype HDF5DataLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - const int data_count = (*top)[0]->count() / (*top)[0]->num(); - const int label_data_count = (*top)[1]->count() / (*top)[1]->num(); - - for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == data_blob_.num()) { - if (num_files_ > 1) { - current_file_ += 1; - - if (current_file_ == num_files_) { - current_file_ = 0; - LOG(INFO) << "looping around to first file"; - } - - LoadHDF5FileData(hdf_filenames_[current_file_].c_str()); - } - current_row_ = 0; - } - CUDA_CHECK(cudaMemcpy( - &(*top)[0]->mutable_gpu_data()[i * data_count], - &data_blob_.cpu_data()[current_row_ * data_count], - sizeof(Dtype) * data_count, - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy( - &(*top)[1]->mutable_gpu_data()[i * label_data_count], - &label_blob_.cpu_data()[current_row_ * label_data_count], - sizeof(Dtype) * label_data_count, - cudaMemcpyHostToDevice)); - } - return Dtype(0.); -} - -INSTANTIATE_CLASS(HDF5DataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index 3a513b9c366..cc87e777f81 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -42,7 +42,7 @@ void HDF5OutputLayer::SaveBlobs() { } template -Dtype HDF5OutputLayer::Forward_cpu(const vector*>& bottom, +Dtype HDF5OutputLayer::Forward(const vector*>& bottom, vector*>* top) { CHECK_GE(bottom.size(), 2); CHECK_EQ(bottom[0]->num(), bottom[1]->num()); @@ -54,23 +54,17 @@ Dtype HDF5OutputLayer::Forward_cpu(const vector*>& bottom, const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); for (int i = 0; i < bottom[0]->num(); ++i) { - memcpy(&data_blob_.mutable_cpu_data()[i * data_datum_dim], - &bottom[0]->cpu_data()[i * data_datum_dim], - sizeof(Dtype) * data_datum_dim); - memcpy(&label_blob_.mutable_cpu_data()[i * label_datum_dim], - &bottom[1]->cpu_data()[i * label_datum_dim], - sizeof(Dtype) * label_datum_dim); + DeviceFactory::GetDevice()->copy_from_cpu( + data_datum_dim, &bottom[0]->const_data()[i * data_datum_dim], + &data_blob_.mutable_data()[i * data_datum_dim]); + DeviceFactory::GetDevice()->copy_from_cpu( + label_datum_dim, &bottom[1]->const_data()[i * label_datum_dim], + &label_blob_.mutable_data()[i * label_datum_dim]); } SaveBlobs(); return Dtype(0.); } -template -void HDF5OutputLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - return; -} - INSTANTIATE_CLASS(HDF5OutputLayer); } // namespace caffe diff --git a/src/caffe/layers/hdf5_output_layer.cu b/src/caffe/layers/hdf5_output_layer.cu deleted file mode 100644 index 59505ee6acf..00000000000 --- a/src/caffe/layers/hdf5_output_layer.cu +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
- -#include - -#include "hdf5.h" -#include "hdf5_hl.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { -using std::vector; - -template -Dtype HDF5OutputLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - CHECK_GE(bottom.size(), 2); - CHECK_EQ(bottom[0]->num(), bottom[1]->num()); - data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), - bottom[1]->height(), bottom[1]->width()); - const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); - const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); - - for (int i = 0; i < bottom[0]->num(); ++i) { - CUDA_CHECK(cudaMemcpy(&data_blob_.mutable_cpu_data()[i * data_datum_dim], - &bottom[0]->gpu_data()[i * data_datum_dim], - sizeof(Dtype) * data_datum_dim, cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(&label_blob_.mutable_cpu_data()[i * label_datum_dim], - &bottom[1]->gpu_data()[i * label_datum_dim], - sizeof(Dtype) * label_datum_dim, cudaMemcpyDeviceToHost)); - } - SaveBlobs(); - return Dtype(0.); -} - -template -void HDF5OutputLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - return; -} - -INSTANTIATE_CLASS(HDF5OutputLayer); - -} // namespace caffe diff --git a/src/caffe/layers/hinge_loss_layer.cpp b/src/caffe/layers/hinge_loss_layer.cpp index 8097761d22b..221b4449360 100644 --- a/src/caffe/layers/hinge_loss_layer.cpp +++ b/src/caffe/layers/hinge_loss_layer.cpp @@ -15,16 +15,16 @@ using std::max; namespace caffe { template -Dtype HingeLossLayer::Forward_cpu(const vector*>& bottom, +Dtype HingeLossLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const Dtype* label = bottom[1]->cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* bottom_diff = bottom[0]->mutable_diff(); + const Dtype* label = bottom[1]->const_data(); int num = bottom[0]->num(); int count = bottom[0]->count(); int dim = count / num; - caffe_copy(count, bottom_data, bottom_diff); + DeviceFactory::GetDevice()->copy(count, bottom_data, bottom_diff); for (int i = 0; i < num; ++i) { bottom_diff[i * dim + static_cast(label[i])] *= -1; } @@ -35,24 +35,29 @@ Dtype HingeLossLayer::Forward_cpu(const vector*>& bottom, } switch (this->layer_param_.hinge_loss_param().norm()) { case HingeLossParameter_Norm_L1: - return caffe_cpu_asum(count, bottom_diff) / num; + Dtype sum; + DeviceFactory::GetDevice()->asum(count, bottom_diff, &sum); + return sum / num; case HingeLossParameter_Norm_L2: - return caffe_cpu_dot(count, bottom_diff, bottom_diff) / num; + Dtype dot; + DeviceFactory::GetDevice()->dot(count, bottom_diff, + bottom_diff, &dot); + return dot / num; default: LOG(FATAL) << "Unknown Norm"; } } template -void HingeLossLayer::Backward_cpu(const vector*>& top, +void HingeLossLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type_name() << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - const Dtype* label = (*bottom)[1]->cpu_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + const Dtype* label = (*bottom)[1]->const_data(); int num = (*bottom)[0]->num(); int count = 
(*bottom)[0]->count(); int dim = count / num; @@ -63,11 +68,13 @@ void HingeLossLayer::Backward_cpu(const vector*>& top, switch (this->layer_param_.hinge_loss_param().norm()) { case HingeLossParameter_Norm_L1: - caffe_cpu_sign(count, bottom_diff, bottom_diff); - caffe_scal(count, Dtype(1. / num), bottom_diff); + DeviceFactory::GetDevice()->sign(count, bottom_diff, bottom_diff); + DeviceFactory::GetDevice()->scal(count, Dtype(1. / num), + bottom_diff); break; case HingeLossParameter_Norm_L2: - caffe_scal(count, Dtype(2. / num), bottom_diff); + DeviceFactory::GetDevice()->scal(count, Dtype(2. / num), + bottom_diff); break; default: LOG(FATAL) << "Unknown Norm"; diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index e047dfb80a7..13048829796 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -25,24 +25,26 @@ void Im2colLayer::SetUp(const vector*>& bottom, } template -Dtype Im2colLayer::Forward_cpu(const vector*>& bottom, +Dtype Im2colLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, + DeviceFactory::GetDevice()->im2col( + bottom_data + bottom[0]->offset(n), channels_, height_, width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n)); } return Dtype(0.); } template -void Im2colLayer::Backward_cpu(const vector*>& top, +void Im2colLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); for (int n = 0; n < top[0]->num(); ++n) { - col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_, + DeviceFactory::GetDevice()->col2im( + top_diff + top[0]->offset(n), channels_, height_, width_, kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n)); } } diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu deleted file mode 100644 index 9cfb74e815c..00000000000 --- a/src/caffe/layers/im2col_layer.cu +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
- -#include - -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/common.hpp" - -namespace caffe { - -template -Dtype Im2colLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n)); - } - return Dtype(0.); -} - -template -void Im2colLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - for (int n = 0; n < top[0]->num(); ++n) { - col2im_gpu(top_diff + top[0]->offset(n), channels_, height_, width_, - kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n)); - } -} - - -INSTANTIATE_CLASS(Im2colLayer); - -} // namespace caffe diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index 1f7368e7e4d..a890090efa6 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -30,8 +30,8 @@ void* ImageDataLayerPrefetch(void* layer_pointer) { CHECK(layer); Datum datum; CHECK(layer->prefetch_data_); - Dtype* top_data = layer->prefetch_data_->mutable_cpu_data(); - Dtype* top_label = layer->prefetch_label_->mutable_cpu_data(); + Dtype* top_data = layer->prefetch_data_->mutable_data(); + Dtype* top_label = layer->prefetch_label_->mutable_data(); ImageDataParameter image_data_param = layer->layer_param_.image_data_param(); const Dtype scale = image_data_param.scale(); const int batch_size = image_data_param.batch_size(); @@ -50,7 +50,7 @@ void* ImageDataLayerPrefetch(void* layer_pointer) { const int width = layer->datum_width_; const int size = layer->datum_size_; const int lines_size = layer->lines_.size(); - const Dtype* mean = layer->data_mean_.cpu_data(); + const Dtype* mean = layer->data_mean_.const_data(); for (int item_id = 0; item_id < batch_size; ++item_id) { // get a blob CHECK_GT(lines_size, layer->lines_id_); @@ -220,12 +220,12 @@ void ImageDataLayer::SetUp(const vector*>& bottom, data_mean_.Reshape(1, datum_channels_, datum_height_, datum_width_); } // Now, start the prefetch thread. Before calling prefetch, we make two - // cpu_data calls so that the prefetch thread does not accidentally make + // const_data calls so that the prefetch thread does not accidentally make // simultaneous cudaMalloc calls when the main thread is running. In some // GPUs this seems to cause failures if we do not so. 
- prefetch_data_->mutable_cpu_data(); - prefetch_label_->mutable_cpu_data(); - data_mean_.cpu_data(); + prefetch_data_->mutable_data(); + prefetch_label_->mutable_data(); + data_mean_.const_data(); DLOG(INFO) << "Initializing prefetch"; CreatePrefetchThread(); DLOG(INFO) << "Prefetch initialized."; @@ -273,15 +273,17 @@ unsigned int ImageDataLayer::PrefetchRand() { } template -Dtype ImageDataLayer::Forward_cpu(const vector*>& bottom, +Dtype ImageDataLayer::Forward(const vector*>& bottom, vector*>* top) { // First, join the thread JoinPrefetchThread(); // Copy the data - caffe_copy(prefetch_data_->count(), prefetch_data_->cpu_data(), - (*top)[0]->mutable_cpu_data()); - caffe_copy(prefetch_label_->count(), prefetch_label_->cpu_data(), - (*top)[1]->mutable_cpu_data()); + DeviceFactory::GetDevice()->copy_from_cpu( + prefetch_data_->count(), prefetch_data_->const_data(), + (*top)[0]->mutable_data()); + DeviceFactory::GetDevice()->copy_from_cpu( + prefetch_label_->count(), prefetch_label_->const_data(), + (*top)[1]->mutable_data()); // Start a new prefetch thread CreatePrefetchThread(); return Dtype(0.); diff --git a/src/caffe/layers/image_data_layer.cu b/src/caffe/layers/image_data_layer.cu deleted file mode 100644 index 98047297d80..00000000000 --- a/src/caffe/layers/image_data_layer.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include -#include -#include -#include - -#include -#include -#include // NOLINT(readability/streams) -#include // NOLINT(readability/streams) - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -using std::string; -using std::pair; - -namespace caffe { - -template -Dtype ImageDataLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - // First, join the thread - JoinPrefetchThread(); - // Copy the data - CUDA_CHECK(cudaMemcpy((*top)[0]->mutable_gpu_data(), - prefetch_data_->cpu_data(), sizeof(Dtype) * prefetch_data_->count(), - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy((*top)[1]->mutable_gpu_data(), - prefetch_label_->cpu_data(), sizeof(Dtype) * prefetch_label_->count(), - cudaMemcpyHostToDevice)); - // Start a new prefetch thread - CreatePrefetchThread(); - return Dtype(0.); -} - -INSTANTIATE_CLASS(ImageDataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp index a72874e4bb4..8a48d18f498 100644 --- a/src/caffe/layers/infogain_loss_layer.cpp +++ b/src/caffe/layers/infogain_loss_layer.cpp @@ -32,11 +32,11 @@ void InfogainLossLayer::FurtherSetUp( template -Dtype InfogainLossLayer::Forward_cpu(const vector*>& bottom, +Dtype InfogainLossLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); - const Dtype* infogain_mat = infogain_.cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + const Dtype* bottom_label = bottom[1]->const_data(); + const Dtype* infogain_mat = infogain_.const_data(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); CHECK_EQ(infogain_.height(), dim); @@ -49,13 +49,13 @@ Dtype InfogainLossLayer::Forward_cpu(const vector*>& bottom, } } if (top->size() == 1) { - (*top)[0]->mutable_cpu_data()[0] = loss / num; + (*top)[0]->mutable_data()[0] = loss / num; } return loss / num; } template -void InfogainLossLayer::Backward_cpu(const vector*>& top, +void InfogainLossLayer::Backward(const 
vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[1]) { @@ -63,10 +63,10 @@ void InfogainLossLayer::Backward_cpu(const vector*>& top, << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* bottom_label = (*bottom)[1]->cpu_data(); - const Dtype* infogain_mat = infogain_.cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + const Dtype* bottom_label = (*bottom)[1]->const_data(); + const Dtype* infogain_mat = infogain_.const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); int num = (*bottom)[0]->num(); int dim = (*bottom)[0]->count() / (*bottom)[0]->num(); CHECK_EQ(infogain_.height(), dim); diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index ddf55e49b63..08278c538da 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -49,7 +49,7 @@ void InnerProductLayer::SetUp(const vector*>& bottom, if (bias_term_) { bias_multiplier_.reset(new SyncedMemory(M_ * sizeof(Dtype))); Dtype* bias_multiplier_data = - reinterpret_cast(bias_multiplier_->mutable_cpu_data()); + reinterpret_cast(bias_multiplier_->mutable_data()); for (int i = 0; i < M_; ++i) { bias_multiplier_data[i] = 1.; } @@ -57,41 +57,46 @@ void InnerProductLayer::SetUp(const vector*>& bottom, } template -Dtype InnerProductLayer::Forward_cpu(const vector*>& bottom, +Dtype InnerProductLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - const Dtype* weight = this->blobs_[0]->cpu_data(); - caffe_cpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); + const Dtype* weight = this->blobs_[0]->const_data(); + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., bottom_data, weight, (Dtype)0., top_data); if (bias_term_) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., - reinterpret_cast(bias_multiplier_->cpu_data()), - this->blobs_[1]->cpu_data(), (Dtype)1., top_data); + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., + reinterpret_cast(bias_multiplier_->const_data()), + this->blobs_[1]->const_data(), (Dtype)1., top_data); } return Dtype(0); } template -void InnerProductLayer::Backward_cpu(const vector*>& top, +void InnerProductLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); + const Dtype* top_diff = top[0]->const_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); // Gradient with respect to weight - caffe_cpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_cpu_diff()); + DeviceFactory::GetDevice()->gemm( + CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., + top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_diff()); if (bias_term_) { // Gradient with respect to bias - caffe_cpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - reinterpret_cast(bias_multiplier_->cpu_data()), (Dtype)0., - this->blobs_[1]->mutable_cpu_diff()); + DeviceFactory::GetDevice()->gemv( + CblasTrans, M_, N_, (Dtype)1., top_diff, + 
reinterpret_cast(bias_multiplier_->const_data()), + (Dtype)0., this->blobs_[1]->mutable_diff()); } if (propagate_down[0]) { // Gradient with respect to bottom data - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, this->blobs_[0]->cpu_data(), (Dtype)0., - (*bottom)[0]->mutable_cpu_diff()); + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., + top_diff, this->blobs_[0]->const_data(), (Dtype)0., + (*bottom)[0]->mutable_diff()); } } diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu deleted file mode 100644 index 5b95a57b23b..00000000000 --- a/src/caffe/layers/inner_product_layer.cu +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include - -#include - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype InnerProductLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., - bottom_data, weight, (Dtype)0., top_data); - if (bias_term_) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., - reinterpret_cast(bias_multiplier_->gpu_data()), - this->blobs_[1]->gpu_data(), (Dtype)1., top_data); - } - return Dtype(0); -} - -template -void InnerProductLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = (*bottom)[0]->gpu_data(); - // Gradient with respect to weight - caffe_gpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_gpu_diff()); - if (bias_term_) { - // Gradient with respect to bias - caffe_gpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - reinterpret_cast(bias_multiplier_->gpu_data()), - (Dtype)0., this->blobs_[1]->mutable_gpu_diff()); - } - if (propagate_down[0]) { - // Gradient with respect to bottom data - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, this->blobs_[0]->gpu_data(), (Dtype)0., - (*bottom)[0]->mutable_gpu_diff()); - } -} - -INSTANTIATE_CLASS(InnerProductLayer); - -} // namespace caffe diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index a86c1d4c59d..7472e60860f 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -114,15 +114,15 @@ Dtype LRNLayer::Forward_cpu(const vector*>& bottom, template Dtype LRNLayer::CrossChannelForward_cpu( const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - Dtype* scale_data = scale_.mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); + Dtype* scale_data = scale_.mutable_data(); // start with the constant value for (int i = 0; i < scale_.count(); ++i) { scale_data[i] = 1.; } Blob padded_square(1, channels_ + size_ - 1, height_, width_); - Dtype* padded_square_data = padded_square.mutable_cpu_data(); + Dtype* padded_square_data = padded_square.mutable_data(); memset(padded_square_data, 0, sizeof(Dtype) * padded_square.count()); Dtype alpha_over_size = alpha_ / size_; // 
go through the images @@ -190,17 +190,17 @@ template void LRNLayer::CrossChannelBackward_cpu( const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* scale_data = scale_.cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); + const Dtype* top_data = top[0]->const_data(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + const Dtype* scale_data = scale_.const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); Blob padded_ratio(1, channels_ + size_ - 1, height_, width_); Blob accum_ratio(1, 1, height_, width_); - Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data(); - Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data(); + Dtype* padded_ratio_data = padded_ratio.mutable_data(); + Dtype* accum_ratio_data = accum_ratio.mutable_data(); // We hack a little bit by using the diff() to store an additional result - Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff(); + Dtype* accum_ratio_times_bottom = accum_ratio.mutable_diff(); memset(padded_ratio_data, 0, sizeof(Dtype) * padded_ratio.count()); Dtype cache_ratio_value = 2. * alpha_ * beta_ / size_; diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp index 15eedb317e3..cb8f1d57626 100644 --- a/src/caffe/layers/memory_data_layer.cpp +++ b/src/caffe/layers/memory_data_layer.cpp @@ -36,7 +36,7 @@ void MemoryDataLayer::Reset(Dtype* data, Dtype* labels, int n) { } template -Dtype MemoryDataLayer::Forward_cpu(const vector*>& bottom, +Dtype MemoryDataLayer::Forward(const vector*>& bottom, vector*>* top) { CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset"; (*top)[0]->set_cpu_data(data_ + pos_ * datum_size_); diff --git a/src/caffe/layers/multinomial_logistic_loss_layer.cpp b/src/caffe/layers/multinomial_logistic_loss_layer.cpp index 013d4034240..763ba9f9d80 100644 --- a/src/caffe/layers/multinomial_logistic_loss_layer.cpp +++ b/src/caffe/layers/multinomial_logistic_loss_layer.cpp @@ -23,10 +23,10 @@ void MultinomialLogisticLossLayer::FurtherSetUp( } template -Dtype MultinomialLogisticLossLayer::Forward_cpu( +Dtype MultinomialLogisticLossLayer::Forward( const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + const Dtype* bottom_label = bottom[1]->const_data(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); Dtype loss = 0; @@ -36,13 +36,13 @@ Dtype MultinomialLogisticLossLayer::Forward_cpu( loss -= log(prob); } if (top->size() == 1){ - (*top)[0]->mutable_cpu_data()[0] = loss / num; + (*top)[0]->mutable_data()[0] = loss / num; } return loss / num; } template -void MultinomialLogisticLossLayer::Backward_cpu( +void MultinomialLogisticLossLayer::Backward( const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[1]) { @@ -50,9 +50,9 @@ void MultinomialLogisticLossLayer::Backward_cpu( << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* bottom_label = (*bottom)[1]->cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + const Dtype* bottom_label = 
(*bottom)[1]->const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); int num = (*bottom)[0]->num(); int dim = (*bottom)[0]->count() / (*bottom)[0]->num(); memset(bottom_diff, 0, sizeof(Dtype) * (*bottom)[0]->count()); diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index bc002078814..303a5ef4e0a 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -82,8 +82,8 @@ void PoolingLayer::SetUp(const vector*>& bottom, template Dtype PoolingLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int top_count = (*top)[0]->count(); // We'll output the mask to top[1] if it's of size >1. const bool use_top_mask = top->size() > 1; @@ -95,10 +95,10 @@ Dtype PoolingLayer::Forward_cpu(const vector*>& bottom, case PoolingParameter_PoolMethod_MAX: // Initialize if (use_top_mask) { - top_mask = (*top)[1]->mutable_cpu_data(); + top_mask = (*top)[1]->mutable_data(); caffe_set(top_count, Dtype(-1), top_mask); } else { - mask = max_idx_->mutable_cpu_data(); + mask = max_idx_->mutable_data(); caffe_set(top_count, -1, mask); } caffe_set(top_count, Dtype(-FLT_MAX), top_data); @@ -188,8 +188,8 @@ void PoolingLayer::Backward_cpu(const vector*>& top, if (!propagate_down[0]) { return; } - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); // Different pooling methods. We explicitly do the switch outside the for // loop to save time, although this results in more codes. caffe_set((*bottom)[0]->count(), Dtype(0), bottom_diff); @@ -201,9 +201,9 @@ void PoolingLayer::Backward_cpu(const vector*>& top, case PoolingParameter_PoolMethod_MAX: // The main loop if (use_top_mask) { - top_mask = top[1]->cpu_data(); + top_mask = top[1]->const_data(); } else { - mask = max_idx_->cpu_data(); + mask = max_idx_->const_data(); } for (int n = 0; n < top[0]->num(); ++n) { for (int c = 0; c < channels_; ++c) { diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index 5ff3392968e..766214ecf01 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -23,78 +23,86 @@ void PowerLayer::SetUp(const vector*>& bottom, // Compute y = (shift + scale * x)^power template -Dtype PowerLayer::Forward_cpu(const vector*>& bottom, +Dtype PowerLayer::Forward(const vector*>& bottom, vector*>* top) { - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int count = bottom[0]->count(); // Special case where we can ignore the input: scale or power is 0. if (diff_scale_ == Dtype(0)) { Dtype value = (power_ == 0) ? 
Dtype(1) : pow(shift_, power_); - caffe_set(count, value, top_data); + DeviceFactory::GetDevice()->set(count, value, top_data); return Dtype(0); } - const Dtype* bottom_data = bottom[0]->cpu_data(); - caffe_copy(count, bottom_data, top_data); + const Dtype* bottom_data = bottom[0]->const_data(); + DeviceFactory::GetDevice()->copy(count, bottom_data, top_data); if (scale_ != Dtype(1)) { - caffe_scal(count, scale_, top_data); + DeviceFactory::GetDevice()->scal(count, scale_, top_data); } if (shift_ != Dtype(0)) { - caffe_add_scalar(count, shift_, top_data); + DeviceFactory::GetDevice()->add_scalar(count, shift_, top_data); } if (power_ != Dtype(1)) { - caffe_powx(count, top_data, power_, top_data); + DeviceFactory::GetDevice()->powx(count, top_data, power_, top_data); } return Dtype(0); } template -void PowerLayer::Backward_cpu(const vector*>& top, +void PowerLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); const int count = (*bottom)[0]->count(); - const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - caffe_set(count, diff_scale_, bottom_diff); + DeviceFactory::GetDevice()->set(count, diff_scale_, bottom_diff); } else { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) // = diff_scale * y / (shift + scale * x) if (power_ == Dtype(2)) { // Special case for y = (shift + scale * x)^2 // -> dy/dx = 2 * scale * (shift + scale * x) // = diff_scale * shift + diff_scale * scale * x - caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data, + DeviceFactory::GetDevice()->axpby( + count, diff_scale_ * scale_, bottom_data, Dtype(0), bottom_diff); if (shift_ != Dtype(0)) { - caffe_add_scalar(count, diff_scale_ * shift_, bottom_diff); + DeviceFactory::GetDevice()->add_scalar( + count, diff_scale_ * shift_, bottom_diff); } } else if (shift_ == Dtype(0)) { // Special case for y = (scale * x)^power // -> dy/dx = scale * power * (scale * x)^(power - 1) // = scale * power * (scale * x)^power * (scale * x)^(-1) // = power * y / x - const Dtype* top_data = top[0]->cpu_data(); - caffe_div(count, top_data, bottom_data, bottom_diff); - caffe_scal(count, power_, bottom_diff); + const Dtype* top_data = top[0]->const_data(); + DeviceFactory::GetDevice()->div( + count, top_data, bottom_data, bottom_diff); + DeviceFactory::GetDevice()->scal(count, power_, bottom_diff); } else { - caffe_copy(count, bottom_data, bottom_diff); + DeviceFactory::GetDevice()->copy(count, bottom_data, + bottom_diff); if (scale_ != Dtype(1)) { - caffe_scal(count, scale_, bottom_diff); + DeviceFactory::GetDevice()->scal(count, scale_, bottom_diff); } if (shift_ != Dtype(0)) { - caffe_add_scalar(count, shift_, bottom_diff); + DeviceFactory::GetDevice()->add_scalar(count, shift_, + bottom_diff); } - const Dtype* top_data = top[0]->cpu_data(); - caffe_div(count, top_data, bottom_diff, bottom_diff); + const Dtype* top_data = top[0]->const_data(); + DeviceFactory::GetDevice()->div(count, top_data, bottom_diff, + bottom_diff); if (diff_scale_ != Dtype(1)) { - caffe_scal(count, diff_scale_, bottom_diff); + DeviceFactory::GetDevice()->scal(count, diff_scale_, + bottom_diff); } } } if (diff_scale_ != Dtype(0)) { - caffe_mul(count, top_diff, bottom_diff, 
bottom_diff); + DeviceFactory::GetDevice()->mul(count, top_diff, bottom_diff, + bottom_diff); } } } diff --git a/src/caffe/layers/power_layer.cu b/src/caffe/layers/power_layer.cu deleted file mode 100644 index 6d699636e21..00000000000 --- a/src/caffe/layers/power_layer.cu +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -using std::max; - -namespace caffe { - -template -Dtype PowerLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // Special case where we can ignore the input: scale or power is 0. - if (diff_scale_ == Dtype(0)) { - Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); - caffe_gpu_set(count, value, top_data); - return Dtype(0); - } - const Dtype* bottom_data = bottom[0]->gpu_data(); - caffe_gpu_copy(count, bottom_data, top_data); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, top_data); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, top_data); - } - if (power_ != Dtype(1)) { - caffe_gpu_powx(count, top_data, power_, top_data); - } - return Dtype(0); -} - -template -void PowerLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - vector*>* bottom) { - if (propagate_down[0]) { - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - const int count = (*bottom)[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - caffe_gpu_set(count, diff_scale_, bottom_diff); - } else { - const Dtype* bottom_data = (*bottom)[0]->gpu_data(); - // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) - // = diff_scale * y / (shift + scale * x) - if (power_ == Dtype(2)) { - // Special case for y = (shift + scale * x)^2 - // -> dy/dx = 2 * scale * (shift + scale * x) - // = diff_scale * shift + diff_scale * scale * x - caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); - } - } else if (shift_ == Dtype(0)) { - // Special case for y = (scale * x)^power - // -> dy/dx = scale * power * (scale * x)^(power - 1) - // = scale * power * (scale * x)^power * (scale * x)^(-1) - // = power * y / x - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - caffe_gpu_scal(count, power_, bottom_diff); - } else { - caffe_gpu_copy(count, bottom_data, bottom_diff); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, bottom_diff); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, bottom_diff); - } - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); - if (diff_scale_ != Dtype(1)) { - caffe_gpu_scal(count, diff_scale_, bottom_diff); - } - } - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); - } -} - -INSTANTIATE_CLASS(PowerLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index d7a8509b247..6efda981228 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -13,8 +13,8 @@ namespace caffe { template Dtype ReLULayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + 
const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { top_data[i] = max(bottom_data[i], Dtype(0)); @@ -27,9 +27,9 @@ void ReLULayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); const int count = (*bottom)[0]->count(); for (int i = 0; i < count; ++i) { bottom_diff[i] = top_diff[i] * (bottom_data[i] > 0); diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index 8cb830ff248..fcc3a19a14d 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -25,7 +25,7 @@ void SigmoidCrossEntropyLossLayer::FurtherSetUp( } template -Dtype SigmoidCrossEntropyLossLayer::Forward_cpu( +Dtype SigmoidCrossEntropyLossLayer::Forward( const vector*>& bottom, vector*>* top) { // The forward pass computes the sigmoid outputs. sigmoid_bottom_vec_[0] = bottom[0]; @@ -34,21 +34,21 @@ Dtype SigmoidCrossEntropyLossLayer::Forward_cpu( const int count = bottom[0]->count(); const int num = bottom[0]->num(); // Stable version of loss computation from input data - const Dtype* input_data = bottom[0]->cpu_data(); - const Dtype* target = bottom[1]->cpu_data(); + const Dtype* input_data = bottom[0]->const_data(); + const Dtype* target = bottom[1]->const_data(); Dtype loss = 0; for (int i = 0; i < count; ++i) { loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) - log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); } if (top->size() == 1) { - (*top)[0]->mutable_cpu_data()[0] = loss / num; + (*top)[0]->mutable_data()[0] = loss / num; } return loss / num; } template -void SigmoidCrossEntropyLossLayer::Backward_cpu( +void SigmoidCrossEntropyLossLayer::Backward( const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[1]) { @@ -59,12 +59,14 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( // First, compute the diff const int count = (*bottom)[0]->count(); const int num = (*bottom)[0]->num(); - const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data(); - const Dtype* target = (*bottom)[1]->cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - caffe_sub(count, sigmoid_output_data, target, bottom_diff); + const Dtype* sigmoid_output_data = sigmoid_output_->const_data(); + const Dtype* target = (*bottom)[1]->const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + DeviceFactory::GetDevice()->sub( + count, sigmoid_output_data, target, bottom_diff); // Scale down gradient - caffe_scal(count, Dtype(1) / num, bottom_diff); + DeviceFactory::GetDevice()->scal(count, Dtype(1) / num, + bottom_diff); } } diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu deleted file mode 100644 index 8f7275827e2..00000000000 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
- -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -using std::max; - -namespace caffe { - -template -Dtype SigmoidCrossEntropyLossLayer::Forward_gpu( - const vector*>& bottom, vector*>* top) { - // The forward pass computes the sigmoid outputs. - sigmoid_bottom_vec_[0] = bottom[0]; - sigmoid_layer_->Forward(sigmoid_bottom_vec_, &sigmoid_top_vec_); - // Compute the loss (negative log likelihood) - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); - // Stable version of loss computation from input data - const Dtype* input_data = bottom[0]->cpu_data(); - const Dtype* target = bottom[1]->cpu_data(); - Dtype loss = 0; - for (int i = 0; i < count; ++i) { - loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) - - log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); - } - if (top->size() == 1) { - (*top)[0]->mutable_cpu_data()[0] = loss / num; - } - return loss / num; -} - -template -void SigmoidCrossEntropyLossLayer::Backward_gpu( - const vector*>& top, const vector& propagate_down, - vector*>* bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type_name() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - // First, compute the diff - const int count = (*bottom)[0]->count(); - const int num = (*bottom)[0]->num(); - const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); - const Dtype* target = (*bottom)[1]->gpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - caffe_gpu_copy(count, sigmoid_output_data, bottom_diff); - caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); - // Scale down gradient - caffe_gpu_scal(count, Dtype(1) / num, bottom_diff); - } -} - -INSTANTIATE_CLASS(SigmoidCrossEntropyLossLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index 50139d863dd..8d8afd3e356 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -17,8 +17,8 @@ inline Dtype sigmoid(Dtype x) { template Dtype SigmoidLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { top_data[i] = sigmoid(bottom_data[i]); @@ -31,9 +31,9 @@ void SigmoidLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* top_data = top[0]->const_data(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); const int count = (*bottom)[0]->count(); for (int i = 0; i < count; ++i) { const Dtype sigmoid_x = top_data[i]; diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 57847d005f6..14548c94c4d 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -19,7 +19,7 @@ void SoftmaxLayer::SetUp(const vector*>& bottom, bottom[0]->height(), bottom[0]->width()); sum_multiplier_.Reshape(1, bottom[0]->channels(), bottom[0]->height(), bottom[0]->width()); - Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); + Dtype* 
multiplier_data = sum_multiplier_.mutable_data(); for (int i = 0; i < sum_multiplier_.count(); ++i) { multiplier_data[i] = 1.; } @@ -29,9 +29,9 @@ void SoftmaxLayer::SetUp(const vector*>& bottom, template Dtype SoftmaxLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - Dtype* scale_data = scale_.mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); + Dtype* scale_data = scale_.mutable_data(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); memcpy(top_data, bottom_data, sizeof(Dtype) * bottom[0]->count()); @@ -45,12 +45,12 @@ Dtype SoftmaxLayer::Forward_cpu(const vector*>& bottom, } // subtraction caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - scale_data, sum_multiplier_.cpu_data(), 1., top_data); + scale_data, sum_multiplier_.const_data(), 1., top_data); // Perform exponentiation caffe_exp(num * dim, top_data, top_data); // sum after exp caffe_cpu_gemv(CblasNoTrans, num, dim, 1., top_data, - sum_multiplier_.cpu_data(), 0., scale_data); + sum_multiplier_.const_data(), 0., scale_data); // Do division for (int i = 0; i < num; ++i) { caffe_scal(dim, Dtype(1.) / scale_data[i], top_data + i * dim); @@ -62,10 +62,10 @@ template void SoftmaxLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* top_data = top[0]->cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - Dtype* scale_data = scale_.mutable_cpu_data(); + const Dtype* top_diff = top[0]->const_diff(); + const Dtype* top_data = top[0]->const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + Dtype* scale_data = scale_.mutable_data(); int num = top[0]->num(); int dim = top[0]->count() / top[0]->num(); memcpy(bottom_diff, top_diff, sizeof(Dtype) * top[0]->count()); @@ -76,7 +76,7 @@ void SoftmaxLayer::Backward_cpu(const vector*>& top, } // subtraction caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - scale_data, sum_multiplier_.cpu_data(), 1., bottom_diff); + scale_data, sum_multiplier_.const_data(), 1., bottom_diff); // elementwise multiplication caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); } diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index 1a3601aa9e6..19c04f874aa 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -32,13 +32,13 @@ void SoftmaxWithLossLayer::SetUp(const vector*>& bottom, } template -Dtype SoftmaxWithLossLayer::Forward_cpu( +Dtype SoftmaxWithLossLayer::Forward( const vector*>& bottom, vector*>* top) { // The forward pass computes the softmax prob values. 
softmax_bottom_vec_[0] = bottom[0]; softmax_layer_->Forward(softmax_bottom_vec_, &softmax_top_vec_); - const Dtype* prob_data = prob_.cpu_data(); - const Dtype* label = bottom[1]->cpu_data(); + const Dtype* prob_data = prob_.const_data(); + const Dtype* label = bottom[1]->const_data(); int num = prob_.num(); int dim = prob_.count() / num; Dtype loss = 0; @@ -47,7 +47,7 @@ Dtype SoftmaxWithLossLayer::Forward_cpu( Dtype(FLT_MIN))); } if (top->size() >= 1) { - (*top)[0]->mutable_cpu_data()[0] = loss / num; + (*top)[0]->mutable_data()[0] = loss / num; } if (top->size() == 2) { (*top)[1]->ShareData(prob_); @@ -56,7 +56,7 @@ Dtype SoftmaxWithLossLayer::Forward_cpu( } template -void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, +void SoftmaxWithLossLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[1]) { @@ -64,17 +64,18 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - const Dtype* prob_data = prob_.cpu_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + const Dtype* prob_data = prob_.const_data(); memcpy(bottom_diff, prob_data, sizeof(Dtype) * prob_.count()); - const Dtype* label = (*bottom)[1]->cpu_data(); + const Dtype* label = (*bottom)[1]->const_data(); int num = prob_.num(); int dim = prob_.count() / num; for (int i = 0; i < num; ++i) { bottom_diff[i * dim + static_cast(label[i])] -= 1; } // Scale down gradient - caffe_scal(prob_.count(), Dtype(1) / num, bottom_diff); + DeviceFactory::GetDevice()->scal(prob_.count(), Dtype(1) / num, + bottom_diff); } } diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu deleted file mode 100644 index e46be6ba85d..00000000000 --- a/src/caffe/layers/softmax_loss_layer.cu +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -using std::max; - -namespace caffe { - -template -Dtype SoftmaxWithLossLayer::Forward_gpu( - const vector*>& bottom, vector*>* top) { - // The forward pass computes the softmax prob values. - return Forward_cpu(bottom, top); -} - -template -void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - // TODO(Yangqing): implement the GPU version of softmax. - Backward_cpu(top, propagate_down, bottom); -} - -INSTANTIATE_CLASS(SoftmaxWithLossLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 28abd95f5ff..f90ea678b95 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -27,7 +27,7 @@ void SplitLayer::SetUp(const vector*>& bottom, } template -Dtype SplitLayer::Forward_cpu(const vector*>& bottom, +Dtype SplitLayer::Forward(const vector*>& bottom, vector*>* top) { for (int i = 0; i < top->size(); ++i) { (*top)[i]->ShareData(*bottom[0]); @@ -36,15 +36,16 @@ Dtype SplitLayer::Forward_cpu(const vector*>& bottom, } template -void SplitLayer::Backward_cpu(const vector*>& top, +void SplitLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { (*bottom)[0]->ShareDiff(*top[0]); // Add remaining top blob diffs. 
- Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); for (int i = 1; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->cpu_diff(); - caffe_axpy(count_, Dtype(1.), top_diff, bottom_diff); + const Dtype* top_diff = top[i]->const_diff(); + DeviceFactory::GetDevice()->axpy( + count_, Dtype(1.), top_diff, bottom_diff); } } } diff --git a/src/caffe/layers/split_layer.cu b/src/caffe/layers/split_layer.cu deleted file mode 100644 index 4c921d39f17..00000000000 --- a/src/caffe/layers/split_layer.cu +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype SplitLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - for (int i = 0; i < top->size(); ++i) { - (*top)[i]->ShareData(*bottom[0]); - } - return Dtype(0.); -} - -template -void SplitLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - if (propagate_down[0]) { - (*bottom)[0]->ShareDiff(*top[0]); - // Add remaining top blob diffs. - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - for (int i = 1; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); - } - } -} - - -INSTANTIATE_CLASS(SplitLayer); - -} // namespace caffe diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp index 6b5166d53e9..8dc6ba7b06d 100644 --- a/src/caffe/layers/tanh_layer.cpp +++ b/src/caffe/layers/tanh_layer.cpp @@ -13,8 +13,8 @@ namespace caffe { template Dtype TanHLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); Dtype exp2x; const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { @@ -29,9 +29,9 @@ void TanHLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* top_data = top[0]->const_data(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); const int count = (*bottom)[0]->count(); Dtype tanhx; for (int i = 0; i < count; ++i) { diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp index e6ed8a6b40e..47d58589064 100644 --- a/src/caffe/layers/threshold_layer.cpp +++ b/src/caffe/layers/threshold_layer.cpp @@ -18,8 +18,8 @@ void ThresholdLayer::SetUp(const vector*>& bottom, template Dtype ThresholdLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { top_data[i] = (bottom_data[i] > threshold_) ? 
Dtype(1) : Dtype(0); diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index fd4860f98be..a068ff008b4 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -40,8 +40,8 @@ void* WindowDataLayerPrefetch(void* layer_pointer) { // At each iteration, sample N windows where N*p are foreground (object) // windows and N*(1-p) are background (non-object) windows - Dtype* top_data = layer->prefetch_data_->mutable_cpu_data(); - Dtype* top_label = layer->prefetch_label_->mutable_cpu_data(); + Dtype* top_data = layer->prefetch_data_->mutable_data(); + Dtype* top_label = layer->prefetch_label_->mutable_data(); const Dtype scale = layer->layer_param_.window_data_param().scale(); const int batch_size = layer->layer_param_.window_data_param().batch_size(); const int crop_size = layer->layer_param_.window_data_param().crop_size(); @@ -49,7 +49,7 @@ void* WindowDataLayerPrefetch(void* layer_pointer) { const bool mirror = layer->layer_param_.window_data_param().mirror(); const float fg_fraction = layer->layer_param_.window_data_param().fg_fraction(); - const Dtype* mean = layer->data_mean_.cpu_data(); + const Dtype* mean = layer->data_mean_.const_data(); const int mean_off = (layer->data_mean_.width() - crop_size) / 2; const int mean_width = layer->data_mean_.width(); const int mean_height = layer->data_mean_.height(); @@ -398,12 +398,12 @@ void WindowDataLayer::SetUp(const vector*>& bottom, data_mean_.Reshape(1, channels, crop_size, crop_size); } // Now, start the prefetch thread. Before calling prefetch, we make two - // cpu_data calls so that the prefetch thread does not accidentally make + // const_data calls so that the prefetch thread does not accidentally make // simultaneous cudaMalloc calls when the main thread is running. In some // GPUs this seems to cause failures if we do not so. - prefetch_data_->mutable_cpu_data(); - prefetch_label_->mutable_cpu_data(); - data_mean_.cpu_data(); + prefetch_data_->mutable_data(); + prefetch_label_->mutable_data(); + data_mean_.const_data(); DLOG(INFO) << "Initializing prefetch"; CreatePrefetchThread(); DLOG(INFO) << "Prefetch initialized."; @@ -439,15 +439,17 @@ unsigned int WindowDataLayer::PrefetchRand() { } template -Dtype WindowDataLayer::Forward_cpu(const vector*>& bottom, +Dtype WindowDataLayer::Forward(const vector*>& bottom, vector*>* top) { // First, join the thread JoinPrefetchThread(); // Copy the data - caffe_copy(prefetch_data_->count(), prefetch_data_->cpu_data(), - (*top)[0]->mutable_cpu_data()); - caffe_copy(prefetch_label_->count(), prefetch_label_->cpu_data(), - (*top)[1]->mutable_cpu_data()); + DeviceFactory::GetDevice()->copy_from_cpu( + prefetch_data_->count(), prefetch_data_->const_data(), + (*top)[0]->mutable_data()); + DeviceFactory::GetDevice()->copy_from_cpu( + prefetch_label_->count(), prefetch_label_->const_data(), + (*top)[1]->mutable_data()); // Start a new prefetch thread CreatePrefetchThread(); return Dtype(0.); diff --git a/src/caffe/layers/window_data_layer.cu b/src/caffe/layers/window_data_layer.cu deleted file mode 100644 index bc49fef6545..00000000000 --- a/src/caffe/layers/window_data_layer.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2014 BVLC and contributors. -// -// Based on data_layer.cpp by Yangqing Jia. 
- -#include -#include - -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -using std::string; -using std::map; -using std::pair; - -// caffe.proto > LayerParameter > WindowDataParameter -// 'source' field specifies the window_file -// 'crop_size' indicates the desired warped size - -namespace caffe { - -template -Dtype WindowDataLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - // First, join the thread - JoinPrefetchThread(); - // Copy the data - CUDA_CHECK(cudaMemcpy((*top)[0]->mutable_gpu_data(), - prefetch_data_->cpu_data(), sizeof(Dtype) * prefetch_data_->count(), - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy((*top)[1]->mutable_gpu_data(), - prefetch_label_->cpu_data(), sizeof(Dtype) * prefetch_label_->count(), - cudaMemcpyHostToDevice)); - // Start a new prefetch thread - CreatePrefetchThread(); - return Dtype(0.); -} - -INSTANTIATE_CLASS(WindowDataLayer); - -} // namespace caffe diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index f364e6767c6..30c823f843a 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -482,20 +482,10 @@ void Net::Update() { const int count = params_[i]->count(); const Dtype* this_diff; Dtype* owner_diff; - switch (Caffe::mode()) { - case Caffe::CPU: - this_diff = params_[i]->cpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); - caffe_add(count, this_diff, owner_diff, owner_diff); - break; - case Caffe::GPU: - this_diff = params_[i]->gpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_gpu_diff(); - caffe_gpu_add(count, this_diff, owner_diff, owner_diff); - break; - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } + this_diff = params_[i]->const_diff(); + owner_diff = params_[param_owners_[i]]->mutable_diff(); + DeviceFactory::GetDevice()->add(count, this_diff, owner_diff, + owner_diff); } // Now, update the owned parameters. for (int i = 0; i < params_.size(); ++i) { diff --git a/src/caffe/opencl_syncedmem.cpp b/src/caffe/opencl_syncedmem.cpp new file mode 100644 index 00000000000..f14d4e8177d --- /dev/null +++ b/src/caffe/opencl_syncedmem.cpp @@ -0,0 +1,133 @@ +// Copyright 2014 BVLC and contributors. 
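The OpenCLSyncedMemory implementation that follows backs each allocation with a single aligned host pointer, wraps it in a cl_mem created with CL_MEM_USE_HOST_PTR, and maps/unmaps that buffer to hand out device-visible pointers. Below is a minimal, self-contained sketch of that zero-copy pattern, not the PR's code: the 4096-byte alignment, the stripped error handling, and the demo structure are assumptions made only for illustration.

// Zero-copy sketch: one aligned host allocation aliased by a CL buffer
// created with CL_MEM_USE_HOST_PTR; host access goes through map/unmap.
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main() {
  const size_t size = 1024 * sizeof(float);
  // Aligned host allocation (the patch uses its own opencl_aligned_malloc
  // helper; the alignment value here is an assumption).
  void* host_ptr = NULL;
  if (posix_memalign(&host_ptr, 4096, size) != 0) return 1;
  memset(host_ptr, 0, size);

  cl_platform_id platform;
  cl_device_id device;
  clGetPlatformIDs(1, &platform, NULL);
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL);
  cl_int err;
  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err);

  // The device buffer aliases the host allocation instead of copying it.
  cl_mem buf = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size,
                              host_ptr, &err);

  // Mapping gives the host a pointer it may read and write; unmapping hands
  // the region back to the device side. This is how the synced memory class
  // keeps its CPU and device views coherent.
  void* mapped = clEnqueueMapBuffer(queue, buf, CL_TRUE,
                                    CL_MAP_READ | CL_MAP_WRITE,
                                    0, size, 0, NULL, NULL, &err);
  static_cast<float*>(mapped)[0] = 42.0f;
  clEnqueueUnmapMemObject(queue, buf, mapped, 0, NULL, NULL);
  clFinish(queue);

  printf("first element seen from the host: %f\n",
         static_cast<float*>(host_ptr)[0]);
  clReleaseMemObject(buf);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
  free(host_ptr);
  return 0;
}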
+ +#ifdef USE_OPENCL +#include + +#include "caffe/common.hpp" +#include "caffe/opencl_syncedmem.hpp" + +namespace caffe { + +OpenCLSyncedMemory::~OpenCLSyncedMemory() { + if (shared_host_ptr_ && this->own_cpu_data_) { + opencl_aligned_free(shared_host_ptr_); + shared_host_ptr_ = NULL; + } + + if (mapped_device_ptr_) { + CL_CHECK(clReleaseMemObject(device_mem_)); + free(mapped_device_ptr_); + mapped_device_ptr_ = NULL; + } +} + +inline void OpenCLSyncedMemory::to_cpu() { + switch (this->head_) { + case UNINITIALIZED: + opencl_aligned_malloc(&shared_host_ptr_, &(this->size_)); + memset(shared_host_ptr_, 0, this->size_); + this->head_ = HEAD_AT_CPU; + this->own_cpu_data_ = true; + break; + case HEAD_AT_GPU: + if (shared_host_ptr_ == NULL) { + opencl_aligned_malloc(&shared_host_ptr_, &(this->size_)); + this->own_cpu_data_ = true; + } + CL_CHECK(clEnqueueReadBuffer( + CaffeOpenCL::queue(), device_mem_, CL_TRUE, 0, + this->size_, shared_host_ptr_, 0, NULL, NULL)); + this->head_ = SYNCED; + break; + case HEAD_AT_CPU: + case SYNCED: + break; + } +} + +inline void OpenCLSyncedMemory::to_gpu() { + switch (this->head_) { + case UNINITIALIZED: +/* + * http://streamcomputing.eu/blog/2013-02-03/opencl-basics-flags-for-the-creating-memory-objects/ + */ + opencl_aligned_malloc(&shared_host_ptr_, &(this->size_)); + cl_int error; + device_mem_ = clCreateBuffer( + CaffeOpenCL::context(), CL_MEM_USE_HOST_PTR, + this->size_, shared_host_ptr_, &error); + CL_CHECK(error); + this->head_ = HEAD_AT_GPU; + break; + case HEAD_AT_CPU: + if (mapped_device_ptr_ == NULL) { + cl_int error; + device_mem_ = clCreateBuffer( + CaffeOpenCL::context(), CL_MEM_USE_HOST_PTR, + this->size_, shared_host_ptr_, &error); + CL_CHECK(error); + mapped_device_ptr_ = clEnqueueMapBuffer( + CaffeOpenCL::queue(), device_mem_, CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, 0, this->size_, 0, NULL, NULL, &error); + CL_CHECK(error); + } + CL_CHECK(clEnqueueWriteBuffer( + CaffeOpenCL::queue(), device_mem_, CL_TRUE, 0, + this->size_, shared_host_ptr_, 0, NULL, NULL)); + this->head_ = SYNCED; + break; + case HEAD_AT_GPU: + case SYNCED: + break; + } +} + +const void* OpenCLSyncedMemory::cpu_data() { + to_cpu(); + return (const void*)shared_host_ptr_; +} + +void OpenCLSyncedMemory::set_cpu_data(void* data) { + CHECK(data); + if (this->own_cpu_data_) { + CaffeFreeHost(shared_host_ptr_); + } + shared_host_ptr_ = data; + this->head_ = HEAD_AT_CPU; + this->own_cpu_data_ = false; +} + +const void* OpenCLSyncedMemory::gpu_data() { + to_gpu(); + cl_int error; + mapped_device_ptr_ = clEnqueueMapBuffer( + CaffeOpenCL::queue(), device_mem_, CL_TRUE, + CL_MAP_WRITE, 0, this->size_, 0, NULL, NULL, &error); + CL_CHECK(error); + CL_CHECK(clEnqueueUnmapMemObject( + CaffeOpenCL::queue(), device_mem_, mapped_device_ptr_, + 0, NULL, NULL)); + return (const void*)(mapped_device_ptr_); +} + +void* OpenCLSyncedMemory::mutable_cpu_data() { + to_cpu(); + return shared_host_ptr_; +} + +void* OpenCLSyncedMemory::mutable_gpu_data() { + to_gpu(); + cl_int error; + mapped_device_ptr_ = clEnqueueMapBuffer( + CaffeOpenCL::queue(), device_mem_, CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, 0, this->size_, 0, NULL, NULL, &error); + CL_CHECK(error); + CL_CHECK(clEnqueueUnmapMemObject( + CaffeOpenCL::queue(), device_mem_, mapped_device_ptr_, + 0, NULL, NULL)); + return mapped_device_ptr_; +} + + +} // namespace caffe +#endif // USE_OPENCL diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 769618175ac..3642bf9b40e 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -153,7 
+153,7 @@ void Solver::Test(const int test_net_id) { } if (i == 0) { for (int j = 0; j < result.size(); ++j) { - const Dtype* result_vec = result[j]->cpu_data(); + const Dtype* result_vec = result[j]->const_data(); for (int k = 0; k < result[j]->count(); ++k) { test_score.push_back(result_vec[k]); } @@ -161,7 +161,7 @@ void Solver::Test(const int test_net_id) { } else { int idx = 0; for (int j = 0; j < result.size(); ++j) { - const Dtype* result_vec = result[j]->cpu_data(); + const Dtype* result_vec = result[j]->const_data(); for (int k = 0; k < result[j]->count(); ++k) { test_score[idx++] += result_vec[k]; } @@ -272,51 +272,25 @@ void SGDSolver::ComputeUpdateValue() { } Dtype momentum = this->param_.momentum(); Dtype weight_decay = this->param_.weight_decay(); - switch (Caffe::mode()) { - case Caffe::CPU: - for (int param_id = 0; param_id < net_params.size(); ++param_id) { - // Compute the value to history, and then copy them to the blob's diff. - Dtype local_rate = rate * net_params_lr[param_id]; - Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - history_[param_id]->mutable_cpu_data()); - if (local_decay) { - // add weight decay - caffe_axpy(net_params[param_id]->count(), - local_decay * local_rate, - net_params[param_id]->cpu_data(), - history_[param_id]->mutable_cpu_data()); - } - // copy - caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - } - break; - case Caffe::GPU: - for (int param_id = 0; param_id < net_params.size(); ++param_id) { - // Compute the value to history, and then copy them to the blob's diff. - Dtype local_rate = rate * net_params_lr[param_id]; - Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; - caffe_gpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->gpu_diff(), momentum, - history_[param_id]->mutable_gpu_data()); - if (local_decay) { - // add weight decay - caffe_gpu_axpy(net_params[param_id]->count(), - local_decay * local_rate, - net_params[param_id]->gpu_data(), - history_[param_id]->mutable_gpu_data()); - } - // copy - caffe_gpu_copy(net_params[param_id]->count(), - history_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); + for (int param_id = 0; param_id < net_params.size(); ++param_id) { + // Compute the value to history, and then copy them to the blob's diff. 
+ Dtype local_rate = rate * net_params_lr[param_id]; + Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; + DeviceFactory::GetDevice()->axpby( + net_params[param_id]->count(), local_rate, + net_params[param_id]->const_diff(), momentum, + history_[param_id]->mutable_data()); + if (local_decay) { + // add weight decay + DeviceFactory::GetDevice()->axpy(net_params[param_id]->count(), + local_decay * local_rate, + net_params[param_id]->const_data(), + history_[param_id]->mutable_data()); } - break; - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + // copy + DeviceFactory::GetDevice()->copy(net_params[param_id]->count(), + history_[param_id]->const_data(), + net_params[param_id]->mutable_diff()); } } diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index fec37d6e9ec..76c99bcf9bc 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -93,6 +93,29 @@ void* SyncedMemory::mutable_gpu_data() { return gpu_ptr_; } +const void* SyncedMemory::const_data() { + switch (Caffe::mode()) { + case Caffe::CPU: + return cpu_data(); + case Caffe::GPU: + return gpu_data(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} + +void* SyncedMemory::mutable_data() { + switch (Caffe::mode()) { + case Caffe::CPU: + return mutable_cpu_data(); + case Caffe::GPU: + return mutable_gpu_data(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} } // namespace caffe diff --git a/src/caffe/syncedmem_factory.cpp b/src/caffe/syncedmem_factory.cpp new file mode 100644 index 00000000000..5e86403e929 --- /dev/null +++ b/src/caffe/syncedmem_factory.cpp @@ -0,0 +1,24 @@ +// Copyright 2014 BVLC and contributors. + +#include "caffe/syncedmem_factory.hpp" + +namespace caffe { + +AbstractSyncedMemory* GetSyncedMemory(const size_t size) { + switch (Caffe::mode()) { + case Caffe::CPU: + case Caffe::GPU: + return new SyncedMemory(size); +#ifdef USE_OPENCL + case Caffe::OPENCL_CPU: + case Caffe::OPENCL_GPU: + return new OpenCLSyncedMemory(size); +#endif + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} + +} // namespace caffe + diff --git a/src/caffe/test/test_concat_layer.cpp b/src/caffe/test/test_concat_layer.cpp index 72e3c902cf1..66c50723379 100644 --- a/src/caffe/test/test_concat_layer.cpp +++ b/src/caffe/test/test_concat_layer.cpp @@ -84,8 +84,8 @@ TYPED_TEST(ConcatLayerTest, TestSetupChannels) { TYPED_TEST(ConcatLayerTest, TestCPUNum) { LayerParameter layer_param; - ConcatLayer layer(layer_param); Caffe::set_mode(Caffe::CPU); + ConcatLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_0, &(this->blob_top_vec_)); layer.Forward(this->blob_bottom_vec_0, &(this->blob_top_vec_)); for (int n = 0; n < this->blob_top_->num(); ++n) { @@ -93,7 +93,8 @@ TYPED_TEST(ConcatLayerTest, TestCPUNum) { for (int h = 0; h < this->blob_top_->height(); ++h) { for (int w = 0; w < this->blob_top_->width(); ++w) { EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_vec_0[0]->data_at(n, c, h, w)); + this->blob_bottom_vec_0[0]->data_at(n, c, h, w)) << + "n " << n << ", c " << c << ", h " << h << ", w " << w; } } } @@ -101,7 +102,8 @@ TYPED_TEST(ConcatLayerTest, TestCPUNum) { for (int h = 0; h < this->blob_top_->height(); ++h) { for (int w = 0; w < this->blob_top_->width(); ++w) { EXPECT_EQ(this->blob_top_->data_at(n, c+3, h, w), - this->blob_bottom_vec_0[1]->data_at(n, c, h, w)); + this->blob_bottom_vec_0[1]->data_at(n, c, h, w)) << + "n " << n << ", c " << c << ", h " << 
h << ", w " << w; } } } diff --git a/src/caffe/test/test_opencl_math_functions.cpp b/src/caffe/test/test_opencl_math_functions.cpp new file mode 100644 index 00000000000..4ff25d82711 --- /dev/null +++ b/src/caffe/test/test_opencl_math_functions.cpp @@ -0,0 +1,268 @@ +// Copyright 2014 BVLC and contributors. + +#include // for uint32_t & uint64_t +#include +#include +#include // for std::fabs +#include // for rand_r + +#include "gtest/gtest.h" +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/util/opencl_math_functions.hpp" + +#include "caffe/test/test_caffe_main.hpp" + +namespace caffe { + +template +class OpenCLMathFunctionsTest : public ::testing::Test { + protected: + OpenCLMathFunctionsTest() + : blob_bottom_(new Blob()), + blob_bottom2_(new Blob()), + blob_top_(new Blob()) { + } + + virtual void SetUp() { + Caffe::set_random_seed(1701); + this->blob_bottom_->Reshape(11, 17, 19, 23); + this->blob_bottom2_->Reshape(11, 17, 19, 23); + this->blob_top_->Reshape(11, 17, 19, 23); + // fill the values + FillerParameter filler_param; + GaussianFiller filler(filler_param); + filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom2_); + filler.Fill(this->blob_top_); + } + + virtual ~OpenCLMathFunctionsTest() { + delete blob_bottom_; + delete blob_bottom2_; + delete blob_top_; + } + + Blob* const blob_bottom_; + Blob* const blob_bottom2_; + Blob* const blob_top_; +}; + +typedef ::testing::Types Dtypes; +TYPED_TEST_CASE(OpenCLMathFunctionsTest, Dtypes); + +TYPED_TEST(OpenCLMathFunctionsTest, TestNothing) { + // The first test case of a test suite takes the longest time + // due to the set up overhead. +} + +// TODO: Fix caffe_opencl_hamming_distance and re-enable this test. +TYPED_TEST(OpenCLMathFunctionsTest, DISABLED_TestHammingDistanceOpenCL) { + int n = this->blob_bottom_->count(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + const TypeParam* y = this->blob_top_->cpu_data(); + int reference_distance = this->ReferenceHammingDistance(n, x, y); + x = this->blob_bottom_->opencl_data(); + y = this->blob_top_->opencl_data(); + int computed_distance = caffe_opencl_hamming_distance(n, x, y); + EXPECT_EQ(reference_distance, computed_distance); +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestAsumOpenCL) { + int n = this->blob_bottom_->count(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + TypeParam std_asum = 0; + for (int i = 0; i < n; ++i) { + std_asum += std::fabs(x[i]); + } + TypeParam opencl_asum; + caffe_opencl_asum(n, this->blob_bottom_->opencl_data(), &opencl_asum); + EXPECT_LT((opencl_asum - std_asum) / std_asum, 1e-2); +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSignOpenCL) { + int n = this->blob_bottom_->count(); + caffe_opencl_sign(n, this->blob_bottom_->opencl_data(), + this->blob_bottom_->mutable_opencl_diff()); + const TypeParam* signs = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(signs[i], x[i] > 0 ? 1 : (x[i] < 0 ? -1 : 0)); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSgnbitOpenCL) { + int n = this->blob_bottom_->count(); + caffe_opencl_sgnbit(n, this->blob_bottom_->opencl_data(), + this->blob_bottom_->mutable_opencl_diff()); + const TypeParam* signbits = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(signbits[i], x[i] < 0 ? 
1 : 0); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestFabsOpenCL) { + int n = this->blob_bottom_->count(); + caffe_opencl_fabs(n, this->blob_bottom_->opencl_data(), + this->blob_bottom_->mutable_opencl_diff()); + const TypeParam* abs_val = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(abs_val[i], x[i] > 0 ? x[i] : -x[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestScaleOpenCL) { + int n = this->blob_bottom_->count(); + TypeParam alpha = this->blob_bottom_->cpu_diff()[caffe_rng_rand() % + this->blob_bottom_->count()]; + caffe_opencl_scale(n, alpha, this->blob_bottom_->opencl_data(), + this->blob_bottom_->mutable_opencl_diff()); + const TypeParam* scaled = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(scaled[i], x[i] * alpha); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestCopyFromCPU) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->cpu_data(); + TypeParam* top_data = this->blob_top_->mutable_cpu_data(); + caffe_opencl_copy_from_cpu(n, bottom_data, top_data); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestCopyOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_copy(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSqrOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_sqr(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestExpOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_exp(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSignOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_sign(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSgnbitOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_sgnbit(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + 
EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestFabsOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_fabs(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestAddOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + const TypeParam* bottom2_data = this->blob_bottom2_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_add(n, bottom_data, bottom2_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + bottom2_data = this->blob_bottom2_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i] + bottom2_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSubOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + const TypeParam* bottom2_data = this->blob_bottom2_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_sub(n, bottom_data, bottom2_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + bottom2_data = this->blob_bottom2_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i] - bottom2_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestMulOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + const TypeParam* bottom2_data = this->blob_bottom2_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_mul(n, bottom_data, bottom2_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + bottom2_data = this->blob_bottom2_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i] * bottom2_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestDivOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + const TypeParam* bottom2_data = this->blob_bottom2_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_div(n, bottom_data, bottom2_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + bottom2_data = this->blob_bottom2_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i] / std::min(bottom2_data[i], 1e-5), top_data[i]); + } +} + +} // namespace caffe diff --git a/src/caffe/test/test_opencl_syncedmem.cpp b/src/caffe/test/test_opencl_syncedmem.cpp new file mode 100644 index 00000000000..0724298356b --- /dev/null +++ b/src/caffe/test/test_opencl_syncedmem.cpp @@ -0,0 +1,90 @@ +// Copyright 2014 BVLC and contributors. 
+ +#include +#include + +#include "cuda_runtime.h" +#include "gtest/gtest.h" +#include "caffe/common.hpp" +#include "caffe/opencl_syncedmem.hpp" + +#include "caffe/test/test_caffe_main.hpp" + +namespace caffe { + +class OpenCLSyncedMemoryTest : public ::testing::Test {}; + +TEST_F(OpenCLSyncedMemoryTest, TestInitialization) { + OpenCLSyncedMemory mem(10); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::UNINITIALIZED); + EXPECT_EQ(mem.size(), 10); + OpenCLSyncedMemory* p_mem = new OpenCLSyncedMemory(10 * sizeof(float)); + EXPECT_EQ(p_mem->size(), 10 * sizeof(float)); + delete p_mem; +} + +TEST_F(OpenCLSyncedMemoryTest, TestAllocation) { + OpenCLSyncedMemory mem(10); + EXPECT_TRUE(mem.cpu_data()); + EXPECT_TRUE(mem.gpu_data()); + EXPECT_TRUE(mem.mutable_cpu_data()); + EXPECT_TRUE(mem.mutable_gpu_data()); +} + +TEST_F(OpenCLSyncedMemoryTest, TestCPUWrite) { + OpenCLSyncedMemory mem(10); + void* cpu_data = mem.mutable_cpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::HEAD_AT_CPU); + memset(cpu_data, 1, mem.size()); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(cpu_data))[i], 1); + } + const void* gpu_data = mem.gpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::SYNCED); + // check if values are the same + char* recovered_value = new char[10]; + cudaMemcpy(reinterpret_cast(recovered_value), gpu_data, 10, + cudaMemcpyDeviceToHost); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(recovered_value))[i], 1); + } + // do another round + cpu_data = mem.mutable_cpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::HEAD_AT_CPU); + memset(cpu_data, 2, mem.size()); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(cpu_data))[i], 2); + } + gpu_data = mem.gpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::SYNCED); + // check if values are the same + cudaMemcpy(reinterpret_cast(recovered_value), gpu_data, 10, + cudaMemcpyDeviceToHost); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(recovered_value))[i], 2); + } + delete[] recovered_value; +} + +TEST_F(OpenCLSyncedMemoryTest, TestGPUWrite) { + OpenCLSyncedMemory mem(10); + void* gpu_data = mem.mutable_gpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::HEAD_AT_GPU); + CUDA_CHECK(cudaMemset(gpu_data, 1, mem.size())); + const void* cpu_data = mem.cpu_data(); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(cpu_data))[i], 1); + } + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::SYNCED); + + gpu_data = mem.mutable_gpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::HEAD_AT_GPU); + CUDA_CHECK(cudaMemset(gpu_data, 2, mem.size())); + cpu_data = mem.cpu_data(); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(cpu_data))[i], 2); + } + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::SYNCED); +} + +} // namespace caffe diff --git a/src/caffe/util/cpu_device.cpp b/src/caffe/util/cpu_device.cpp new file mode 100644 index 00000000000..72e3af12025 --- /dev/null +++ b/src/caffe/util/cpu_device.cpp @@ -0,0 +1,175 @@ +// Copyright 2014 BVLC and contributors. 
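The CPU device backend that follows (src/caffe/util/cpu_device.cpp) supplies the concrete implementation behind the DeviceFactory::GetDevice()->gemm/axpby/copy/... calls that the layer, net, and solver hunks above switched to. The device.hpp interface itself is not shown in this part of the patch, so the sketch below is only an illustration of what such an interface and factory could look like; apart from Device, CPUDevice, DeviceFactory::GetDevice and the math-call names visible in the diff, every name here is an assumption.

// Minimal sketch of a backend-neutral math interface plus a factory that
// dispatches on the run-time mode. Plain loops stand in for the
// caffe_cpu_* / BLAS calls that the real cpu_device.cpp delegates to.
#include <cstring>

namespace device_sketch {

enum Brew { CPU, GPU, OPENCL_CPU, OPENCL_GPU, OPENCL_ALL };
inline Brew& mode() { static Brew m = CPU; return m; }  // stands in for Caffe::mode()

template <typename Dtype>
class Device {
 public:
  virtual ~Device() {}
  // y = alpha * x + beta * y
  virtual void axpby(const int n, const Dtype alpha, const Dtype* x,
                     const Dtype beta, Dtype* y) = 0;
  virtual void copy(const int n, const Dtype* x, Dtype* y) = 0;
  virtual void scal(const int n, const Dtype alpha, Dtype* x) = 0;
};

template <typename Dtype>
class CPUDevice : public Device<Dtype> {
 public:
  void axpby(const int n, const Dtype alpha, const Dtype* x,
             const Dtype beta, Dtype* y) {
    for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + beta * y[i];
  }
  void copy(const int n, const Dtype* x, Dtype* y) {
    std::memcpy(y, x, sizeof(Dtype) * n);
  }
  void scal(const int n, const Dtype alpha, Dtype* x) {
    for (int i = 0; i < n; ++i) x[i] *= alpha;
  }
};

// One switch on the run-time mode replaces the per-layer
// switch (Caffe::mode()) { case CPU: ... case GPU: ... } blocks removed above.
template <typename Dtype>
class DeviceFactory {
 public:
  static Device<Dtype>* GetDevice() {
    static CPUDevice<Dtype> cpu_device;
    switch (mode()) {
      case CPU:
      default:  // GPU / OPENCL_* backends would be returned here instead.
        return &cpu_device;
    }
  }
};

}  // namespace device_sketch

With an interface of this shape, a layer body such as PowerLayer::Backward above is written once and runs on whichever backend the factory returns.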
+
+#include "caffe/common.hpp"
+#include "caffe/util/device.hpp"
+
+namespace caffe {
+template <typename Dtype>
+void CPUDevice<Dtype>::gemm(const CBLAS_TRANSPOSE TransA,
+                            const CBLAS_TRANSPOSE TransB, const int M,
+                            const int N, const int K, const Dtype alpha,
+                            const Dtype* A, const Dtype* B,
+                            const Dtype beta, Dtype* C) {
+  caffe_cpu_gemm<Dtype>(TransA, TransB, M, N, K, alpha, A, B, beta, C);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::gemv(const CBLAS_TRANSPOSE TransA, const int M,
+                            const int N, const Dtype alpha, const Dtype* A,
+                            const Dtype* x, const Dtype beta, Dtype* y) {
+  caffe_cpu_gemv<Dtype>(TransA, M, N, alpha, A, x, beta, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::axpy(const int N, const Dtype alpha, const Dtype* X,
+                            Dtype* Y) {
+  caffe_axpy<Dtype>(N, alpha, X, Y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::axpby(const int N, const Dtype alpha,
+                             const Dtype* X, const Dtype beta, Dtype* Y) {
+  caffe_cpu_axpby<Dtype>(N, alpha, X, beta, Y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::copy(const int N, const Dtype *X, Dtype *Y) {
+  caffe_copy<Dtype>(N, X, Y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::copy_from_cpu(const int N, const Dtype *X, Dtype *Y) {
+  caffe_copy<Dtype>(N, X, Y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::set(const int N, const Dtype alpha, Dtype *X) {
+  caffe_set<Dtype>(N, alpha, X);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::add_scalar(const int N, const Dtype alpha,
+                                  Dtype *X) {
+  caffe_add_scalar<Dtype>(N, alpha, X);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::scal(const int N, const Dtype alpha, Dtype *X) {
+  caffe_scal<Dtype>(N, alpha, X);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::sqr(const int N, const Dtype* a, Dtype* y) {
+  caffe_sqr<Dtype>(N, a, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::add(const int N, const Dtype* a, const Dtype* b,
+                           Dtype* y) {
+  caffe_add<Dtype>(N, a, b, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::sub(const int N, const Dtype* a, const Dtype* b,
+                           Dtype* y) {
+  caffe_sub<Dtype>(N, a, b, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::mul(const int N, const Dtype* a, const Dtype* b,
+                           Dtype* y) {
+  caffe_mul<Dtype>(N, a, b, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::div(const int N, const Dtype* a, const Dtype* b,
+                           Dtype* y) {
+  caffe_div<Dtype>(N, a, b, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::powx(const int N, const Dtype* a, const Dtype b,
+                            Dtype* y) {
+  caffe_powx<Dtype>(N, a, b, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::rng_uniform(const int N, const Dtype a,
+                                   const Dtype b, Dtype* r) {
+  caffe_rng_uniform<Dtype>(N, a, b, r);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::rng_gaussian(const int N, const Dtype mu,
+                                    const Dtype sigma, Dtype* r) {
+  caffe_rng_gaussian<Dtype>(N, mu, sigma, r);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::rng_bernoulli(const int N, const Dtype p, int* r) {
+  caffe_rng_bernoulli<Dtype>(N, p, r);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::exp(const int N, const Dtype* a, Dtype* y) {
+  caffe_exp<Dtype>(N, a, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::dot(const int N, const Dtype* x, const Dtype* y,
+                           Dtype* out) {
+  *out = caffe_cpu_dot<Dtype>(N, x, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::hamming_distance(const int N, const Dtype* x,
+                                        const Dtype* y, uint32_t* out) {
+  *out = caffe_cpu_hamming_distance<Dtype>(N, x, y);
+}
+
+template <typename Dtype>
+// Returns the sum of the absolute values of the elements of vector x
+void CPUDevice<Dtype>::asum(const int N, const Dtype* x, Dtype* y) {
+  *y = caffe_cpu_asum<Dtype>(N, x);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::sign(const int N, const Dtype* x, Dtype* y) {
+  caffe_cpu_sign<Dtype>(N, x, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::sgnbit(const int N, const Dtype* x, Dtype* y) {
+  caffe_cpu_sgnbit<Dtype>(N, x, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::fabs(const int N, const Dtype* x, Dtype* y) {
+  caffe_cpu_fabs<Dtype>(N, x, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::scale(const int N, const Dtype alpha,
+                             const Dtype *x, Dtype* y) {
+  caffe_cpu_scale<Dtype>(N, alpha, x, y);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::im2col(const Dtype* data_im, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_col) {
+  im2col_cpu<Dtype>(data_im, channels, height, width, ksize, pad, stride,
+                    data_col);
+}
+
+template <typename Dtype>
+void CPUDevice<Dtype>::col2im(const Dtype* data_col, const int channels,
+    const int height, const int width, const int psize, const int pad,
+    const int stride, Dtype* data_im) {
+  col2im_cpu<Dtype>(data_col, channels, height, width, psize, pad, stride,
+                    data_im);
+}
+
+INSTANTIATE_CLASS(CPUDevice);
+
+} // namespace caffe
diff --git a/src/caffe/util/device.cpp b/src/caffe/util/device.cpp
new file mode 100644
index 00000000000..bb25372122f
--- /dev/null
+++ b/src/caffe/util/device.cpp
@@ -0,0 +1,42 @@
+// Copyright 2014 BVLC and contributors.
+
+#include "caffe/common.hpp"
+#include "caffe/util/device.hpp"
+#ifdef USE_OPENCL
+#include "caffe/util/opencl_device.hpp"
+#endif
+
+namespace caffe {
+
+template <typename Dtype>
+Device<Dtype>* DeviceFactory<Dtype>::GetDevice() {
+  switch (Caffe::mode()) {
+  case Caffe::CPU:
+    return cpu_device_;
+  case Caffe::GPU:
+    return gpu_device_;
+#ifdef USE_OPENCL
+  case Caffe::OPENCL_CPU:
+  case Caffe::OPENCL_GPU:
+    return opencl_device_;
+#endif
+  default:
+    LOG(FATAL) << "Unknown caffe mode.";
+    return static_cast<Device<Dtype>*>(NULL);
+  }
+}
+
+template <typename Dtype>
+Device<Dtype>* DeviceFactory<Dtype>::cpu_device_ = new CPUDevice<Dtype>();
+
+template <typename Dtype>
+Device<Dtype>* DeviceFactory<Dtype>::gpu_device_ = new GPUDevice<Dtype>();
+
+#ifdef USE_OPENCL
+template <typename Dtype>
+Device<Dtype>* DeviceFactory<Dtype>::opencl_device_ = new OpenCLDevice<Dtype>();
+#endif
+
+INSTANTIATE_CLASS(DeviceFactory);
+
+} // namespace caffe
diff --git a/src/caffe/util/gpu_device.cpp b/src/caffe/util/gpu_device.cpp
new file mode 100644
index 00000000000..d5083801b55
--- /dev/null
+++ b/src/caffe/util/gpu_device.cpp
@@ -0,0 +1,178 @@
+// Copyright 2014 BVLC and contributors.
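The DeviceFactory above is what lets layer code collapse per-mode math calls into a single mode-agnostic code path. A minimal sketch of the resulting call pattern follows; the free function and its arguments are illustrative only, while the Device<Dtype>::gemm signature matches cpu_device.cpp above:

// Illustrative only: route a GEMM through whichever backend Caffe::mode() selects.
template <typename Dtype>
void device_agnostic_gemm_example(const int M, const int N, const int K,
                                  const Dtype* A, const Dtype* B, Dtype* C) {
  // DeviceFactory<Dtype>::GetDevice() returns the CPU, CUDA, or OpenCL backend
  // based on Caffe::mode(), so no per-mode branch is needed at the call site.
  DeviceFactory<Dtype>::GetDevice()->gemm(
      CblasNoTrans, CblasNoTrans, M, N, K, Dtype(1), A, B, Dtype(0), C);
}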
+ +#include "caffe/common.hpp" +#include "caffe/util/device.hpp" + +namespace caffe { +template +void GPUDevice::gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, + const Dtype beta, Dtype* C) { + caffe_gpu_gemm(TransA, TransB, M, N, K, alpha, A, B, beta, C); +} + +template +void GPUDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const Dtype alpha, const Dtype* A, + const Dtype* x, const Dtype beta, Dtype* y) { + caffe_gpu_gemv(TransA, M, N, alpha, A, x, beta, y); +} + +template +void GPUDevice::axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y) { + caffe_gpu_axpy(N, alpha, X, Y); +} + +template +void GPUDevice::axpby(const int N, const Dtype alpha, + const Dtype* X, const Dtype beta, Dtype* Y) { + caffe_gpu_axpby(N, alpha, X, beta, Y); +} + +template +void GPUDevice::copy(const int N, const Dtype *X, Dtype *Y) { + caffe_gpu_copy(N, X, Y); +} + +template +void GPUDevice::copy_from_cpu(const int N, const Dtype *X, Dtype *Y) { + CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyHostToDevice)); +} + +template +void GPUDevice::set(const int N, const Dtype alpha, Dtype *X) { + caffe_gpu_set(N, alpha, X); +} + +template +void GPUDevice::add_scalar(const int N, const Dtype alpha, + Dtype *X) { + caffe_gpu_add_scalar(N, alpha, X); +} + +template +void GPUDevice::scal(const int N, const Dtype alpha, Dtype *X) { + caffe_gpu_scal(N, alpha, X); +} + +template +void GPUDevice::sqr(const int N, const Dtype* a, Dtype* y) { + NOT_IMPLEMENTED; +// caffe_gpu_sqr(N, a, y); +} + +template +void GPUDevice::add(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_gpu_add(N, a, b, y); +} + +template +void GPUDevice::sub(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_gpu_sub(N, a, b, y); +} + +template +void GPUDevice::mul(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_gpu_mul(N, a, b, y); +} + +template +void GPUDevice::div(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_gpu_div(N, a, b, y); +} + +template +void GPUDevice::powx(const int N, const Dtype* a, const Dtype b, + Dtype* y) { + caffe_gpu_powx(N, a, b, y); +} + +template +void GPUDevice::rng_uniform(const int N, const Dtype a, + const Dtype b, Dtype* r) { + caffe_gpu_rng_uniform(N, a, b, r); +} + +template +void GPUDevice::rng_gaussian(const int N, const Dtype mu, + const Dtype sigma, Dtype* r) { + caffe_gpu_rng_gaussian(N, mu, sigma, r); +} + +template +void GPUDevice::rng_bernoulli(const int N, const Dtype p, int* r) { + NOT_IMPLEMENTED; +// caffe_gpu_rng_bernoulli(N, p, r); +} + +template +void GPUDevice::exp(const int N, const Dtype* a, Dtype* y) { + NOT_IMPLEMENTED; +// caffe_gpu_exp(N, a, y); +} + +template +void GPUDevice::dot(const int N, const Dtype* x, const Dtype* y, + Dtype* out) { + caffe_gpu_dot(N, x, y, out); +} + +template +void GPUDevice::hamming_distance(const int N, const Dtype* x, + const Dtype* y, uint32_t* out) { + *out = caffe_gpu_hamming_distance(N, x, y); +} + +template +// Returns the sum of the absolute values of the elements of vector x +void GPUDevice::asum(const int N, const Dtype* x, Dtype* y) { + caffe_gpu_asum(N, x, y); +} + +template +void GPUDevice::sign(const int N, const Dtype* x, Dtype* y) { + caffe_gpu_sign(N, x, y); +} + +template +void GPUDevice::sgnbit(const int N, const Dtype* x, Dtype* y) { + caffe_gpu_sgnbit(N, x, y); +} + +template +void GPUDevice::fabs(const int N, const Dtype* x, 
Dtype* y) { + caffe_gpu_fabs(N, x, y); +} + +template +void GPUDevice::scale(const int N, const Dtype alpha, + const Dtype *x, Dtype* y) { + caffe_gpu_scale(N, alpha, x, y); +} + +template +void GPUDevice::im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col) { + im2col_gpu(data_im, channels, height, width, ksize, pad, stride, + data_col); +} + +template +void GPUDevice::col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im) { + col2im_gpu(data_col, channels, height, width, psize, pad, stride, + data_im); +} + +INSTANTIATE_CLASS(GPUDevice); + +} // namespace caffe diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp new file mode 100644 index 00000000000..a1099e11100 --- /dev/null +++ b/src/caffe/util/opencl_device.cpp @@ -0,0 +1,383 @@ +// Copyright 2014 BVLC and contributors. + +#ifdef USE_OPENCL +#include "caffe/common.hpp" +#include "caffe/util/opencl_device.hpp" +#include "caffe/util/opencl_math_functions.hpp" + +#include + +namespace caffe { + +shared_ptr CaffeOpenCL::singleton_; + +cl_device_type CaffeOpenCL::get_device_type() { + switch (Caffe::mode()) { + case Caffe::OPENCL_CPU: + return CL_DEVICE_TYPE_CPU; + case Caffe::OPENCL_GPU: + return CL_DEVICE_TYPE_GPU; + case Caffe::OPENCL_ALL: + default: + return CL_DEVICE_TYPE_ALL; + } +} + +/** + * http://dhruba.name/2012/08/14/opencl-cookbook-listing-all-devices-and-their-critical-attributes/ + */ +void CaffeOpenCL::create_context() { + cl_uint platformCount; + CL_CHECK(clGetPlatformIDs(0, NULL, &platformCount)); + + cl_platform_id* platforms = (cl_platform_id*) + malloc(sizeof(cl_platform_id) * platformCount); + CL_CHECK(clGetPlatformIDs(1, platforms, NULL)); + + cl_uint device_count; + cl_device_type device_type = get_device_type(); + int num_devices_to_skip = current_device_id_; + while (num_devices_to_skip >= 0) { + for (int i = 0; i < platformCount; i++) { + cl_context_properties properties[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)( + platforms[i]), 0}; + // get all devices + clGetDeviceIDs(platforms[i], device_type, 0, NULL, &device_count); + if (num_devices_to_skip <= device_count) { + current_cl_platform_id_ = platforms[i]; + current_platform_device_count_ = device_count; + current_platform_device_id_ = num_devices_to_skip; + current_platform_device_ids_.resize(device_count); + CL_CHECK(clGetDeviceIDs(current_cl_platform_id_, device_type, + current_platform_device_count_, + &(current_platform_device_ids_[0]), NULL)); + cl_int error = CL_SUCCESS; // Used to handle error codes +/* + * http://dhruba.name/2012/10/14/opencl-cookbook-how-to-leverage-multiple-devices-in-opencl/ + * https://software.intel.com/sites/products/documentation/ioclsdk/2013/OG/Using_Shared_Context_for_Multiple_OpenCL_Devices.htm + */ + cl_context_ = clCreateContext( + properties, device_count, &(current_platform_device_ids_[0]), + NULL, NULL, &error); + CL_CHECK(error); + } + num_devices_to_skip -= device_count; + if (num_devices_to_skip < 0) { + break; + } + } + } +} + +cl_device_id CaffeOpenCL::current_cl_device_id() { + // To initialize current platform info + context(); + return current_platform_device_ids_[current_platform_device_id_]; +} + +void CaffeOpenCL::create_queue() { + cl_int error = 0; // Used to handle error codes + cl_command_queue_properties properties = 0; + cl_command_queue_ = clCreateCommandQueue( + 
context(), current_cl_device_id(), properties, &error); + CL_CHECK(error); +} + +void CaffeOpenCL::release_context() { + CL_CHECK(clReleaseContext(cl_context_)); + cl_context_ = NULL; +} + +void CaffeOpenCL::release_queue() { + CL_CHECK(clReleaseCommandQueue(cl_command_queue_)); + cl_command_queue_ = NULL; +} + +void CaffeOpenCL::SetDevice(const int device_id) { + if (current_device_id_ != device_id) { + current_device_id_ = device_id; + release_queue(); + // TODO: reuse context for the devices of the same platform + release_context(); + context(); + finalize_clblas(); + initialize_clblas(); + } +} + +void CaffeOpenCL::initialize_clblas() { + if (!clblas_initialized_) { + CLBLAS_CHECK(clblasSetup()); + clblas_initialized_ = true; + } +} + +void CaffeOpenCL::finalize_clblas() { + if (clblas_initialized_) { + clblasTeardown(); + clblas_initialized_ = false; + } +} + +template +void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, + const Dtype beta, Dtype* C) { + caffe_opencl_gemm(TransA, TransB, M, N, K, alpha, A, B, beta, C); +} + +template +void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const Dtype alpha, const Dtype* A, + const Dtype* x, const Dtype beta, Dtype* y) { + caffe_opencl_gemv(TransA, M, N, alpha, A, x, beta, y); +} + +template +void OpenCLDevice::axpy(const int N, const Dtype alpha, + const Dtype* X, Dtype* Y) { + caffe_opencl_axpy(N, alpha, X, Y); +} + +template +void OpenCLDevice::scal(const int N, const Dtype alpha, Dtype *X) { + caffe_opencl_scal(N, alpha, X); +} + +template +void OpenCLDevice::axpby( + const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y) { + caffe_opencl_axpby(N, alpha, X, beta, Y); +} + +template +void OpenCLDevice::copy(const int N, const Dtype *X, Dtype *Y) { + caffe_opencl_copy(N, X, Y); +} + + +/** + * http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clEnqueueWriteBuffer.html + */ +template +void OpenCLDevice::copy_from_cpu(const int N, const Dtype *X, + Dtype *Y) { + caffe_opencl_copy_from_cpu(N, X, Y); +} + +/** + * http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clEnqueueFillBuffer.html + */ +template +void OpenCLDevice::set(const int N, const Dtype alpha, Dtype *X) { + caffe_opencl_set(N, alpha, X); +} + + +//template +//void OpenCLDevice::add_scalar(const int N, const Dtype alpha, +// Dtype *X) { +// NOT_IMPLEMENTED; +//} + +//template +//void OpenCLDevice::powx(const int N, const Dtype* a, const Dtype b, +// Dtype* y) { +// NOT_IMPLEMENTED; +//// caffe_gpu_powx(N, a, b, y); +//} + +//template +//void OpenCLDevice::rng_uniform(const int N, const Dtype a, +// const Dtype b, Dtype* r) { +// NOT_IMPLEMENTED; +//// caffe_gpu_rng_uniform(N, a, b, r); +//} + +//template +//void OpenCLDevice::rng_gaussian(const int N, const Dtype mu, +// const Dtype sigma, Dtype* r) { +// NOT_IMPLEMENTED; +//// caffe_gpu_rng_gaussian(N, mu, sigma, r); +//} + +//template +//void OpenCLDevice::rng_bernoulli(const int N, const Dtype p, int* r) { +// NOT_IMPLEMENTED; +//// caffe_gpu_rng_bernoulli(N, p, r); +//} + +//template +//void OpenCLDevice::dot(const int N, const Dtype* x, const Dtype* y, +// Dtype* out) { +// NOT_IMPLEMENTED; +//// caffe_gpu_dot(N, x, y, out); +//} + +//template +//void OpenCLDevice::hamming_distance(const int N, const Dtype* x, +// const Dtype* y, uint32_t* out) { +// NOT_IMPLEMENTED; +//// *out = caffe_gpu_hamming_distance(N, x, y); +//} + +/** + 
* +clblasSasum( + size_t N, + cl_mem asum, + size_t offAsum, + const cl_mem X, + size_t offx, + int incx, + cl_mem scratchBuff, + cl_uint numCommandQueues, + cl_command_queue *commandQueues, + cl_uint numEventsInWaitList, + const cl_event *eventWaitList, + cl_event *events) + */ +//template +//void OpenCLDevice::asum(const int N, const Dtype* x, Dtype* y) { +// NOT_IMPLEMENTED; +//// CREATE_CL_MEM(x, N, 1, READ_ONLY); +//// CREATE_CL_MEM(y, N, 1, READ_WRITE); +//// PRE_CLBLAS_CALL; +//// CLBLAS_CHECK(clblasSasum( +//// N, alpha, ARRAY(X), +//// CLBALS_TRAILING_ARGS)); +//} + +//template +//void OpenCLDevice::scale(const int N, const Dtype alpha, +// const Dtype *x, Dtype* y) { +// this->copy(N, x, y); +// this->scal(N, alpha, y); +//} + +//template +//void OpenCLDevice::im2col( +// const Dtype* data_im, const int channels, +// const int height, const int width, const int ksize, const int pad, +// const int stride, Dtype* data_col) { +//// NOT_IMPLEMENTED; +//// im2col_gpu(data_im, channels, height, width, ksize, pad, stride, +//// data_col); +//} + +//template +//void OpenCLDevice::col2im( +// const Dtype* data_col, const int channels, +// const int height, const int width, const int psize, const int pad, +// const int stride, Dtype* data_im) { +//// NOT_IMPLEMENTED; +//// col2im_gpu(data_col, channels, height, width, psize, pad, stride, +//// data_im); +//} + + +INSTANTIATE_CLASS(OpenCLDevice); + +const char* clGetErrorString(cl_int error) { + switch (error) { + case CL_SUCCESS: + return "CL_SUCCESS"; + case CL_INVALID_VALUE: + return "CL_INVALID_VALUE"; + case CL_INVALID_COMMAND_QUEUE: + return "CL_INVALID_COMMAND_QUEUE"; + case CL_INVALID_CONTEXT: + return "CL_INVALID_CONTEXT"; + case CL_INVALID_MEM_OBJECT: + return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_DEVICE: + return "CL_INVALID_DEVICE"; + case CL_INVALID_EVENT_WAIT_LIST: + return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_OUT_OF_RESOURCES: + return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: + return "CL_OUT_OF_HOST_MEMORY"; + case CL_INVALID_OPERATION: + return "CL_INVALID_OPERATION"; + case CL_COMPILER_NOT_AVAILABLE: + return "CL_COMPILER_NOT_AVAILABLE"; + case CL_BUILD_PROGRAM_FAILURE: + return "CL_BUILD_PROGRAM_FAILURE"; + } + return "Unknown OpenCL error"; +} + +const char* clblasGetErrorString(clblasStatus status) { + switch (status) { + case clblasSuccess: + return "clblasSuccess"; + case clblasInvalidValue: + return "clblasInvalidValue"; + case clblasInvalidCommandQueue: + return "clblasInvalidCommandQueue"; + case clblasInvalidContext: + return "clblasInvalidContext"; + case clblasInvalidMemObject: + return "clblasInvalidMemObject"; + case clblasInvalidDevice: + return "clblasInvalidDevice"; + case clblasInvalidEventWaitList: + return "clblasInvalidEventWaitList"; + case clblasOutOfResources: + return "clblasOutOfResources"; + case clblasOutOfHostMemory: + return "clblasOutOfHostMemory"; + case clblasInvalidOperation: + return "clblasInvalidOperation"; + case clblasCompilerNotAvailable: + return "clblasCompilerNotAvailable"; + case clblasBuildProgramFailure: + return "clblasBuildProgramFailure"; + case clblasNotImplemented: + return "clblasNotImplemented"; + case clblasNotInitialized: + return "clblasNotInitialized"; + case clblasInvalidMatA: + return "clblasInvalidMatA"; + case clblasInvalidMatB: + return "clblasInvalidMatB"; + case clblasInvalidMatC: + return "clblasInvalidMatC"; + case clblasInvalidVecX: + return "clblasInvalidVecX"; + case clblasInvalidVecY: + return "clblasInvalidVecY"; + case 
clblasInvalidDim: + return "clblasInvalidDim"; + case clblasInvalidLeadDimA: + return "clblasInvalidLeadDimA"; + case clblasInvalidLeadDimB: + return "clblasInvalidLeadDimB"; + case clblasInvalidLeadDimC: + return "clblasInvalidLeadDimC"; + case clblasInvalidIncX: + return "clblasInvalidIncX"; + case clblasInvalidIncY: + return "clblasInvalidIncY"; + case clblasInsufficientMemMatA: + return "clblasInsufficientMemMatA"; + case clblasInsufficientMemMatB: + return "clblasInsufficientMemMatB"; + case clblasInsufficientMemMatC: + return "clblasInsufficientMemMatC"; + case clblasInsufficientMemVecX: + return "clblasInsufficientMemVecX"; + case clblasInsufficientMemVecY: + return "clblasInsufficientMemVecY"; + } + return "Unknown clblas status"; +} + +} // namespace caffe + +#endif // USE_OPENCL diff --git a/src/caffe/util/opencl_math_functions.cpp b/src/caffe/util/opencl_math_functions.cpp new file mode 100644 index 00000000000..7f9cbc0f3a3 --- /dev/null +++ b/src/caffe/util/opencl_math_functions.cpp @@ -0,0 +1,234 @@ +// Copyright 2014 BVLC and contributors. + +#ifdef USE_OPENCL +//#include "caffe/common.hpp" +#include "caffe/util/opencl_math_functions.hpp" + +namespace caffe { + +template <> +void caffe_opencl_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + int ldA = (TransA == CblasNoTrans) ? K : M; + int ldB = (TransB == CblasNoTrans) ? N : K; + int ldC = N; + clblasTranspose clTransA = to_clblasTranspose(TransA); + clblasTranspose clTransB = to_clblasTranspose(TransB); + CREATE_CL_MEM(A, M, K, READ_ONLY); + CREATE_CL_MEM(B, K, N, READ_ONLY); + CREATE_CL_MEM(C, M, N, READ_WRITE); + ENQUEUE_CL_BUFFER(Write, A, M, K); + ENQUEUE_CL_BUFFER(Write, B, K, N); + ENQUEUE_CL_BUFFER(Write, C, M, N); + PRE_CLBLAS_CALL; + // bufX is defined by the macro CREATE_CL_MEM(X, ...) + CLBLAS_CHECK(clblasSgemm(clblasRowMajor, clTransA, clTransB, + M, N, K, alpha, ARRAY(A), ARRAY(B), beta, ARRAY(C), + CLBLAS_TRAILING_ARGS)); + /* Release OpenCL memory objects. */ + RELEASE_CL_MEM(C); + RELEASE_CL_MEM(B); + RELEASE_CL_MEM(A); +} + +template <> +void caffe_opencl_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + int ldA = (TransA == CblasNoTrans) ? K : M; + int ldB = (TransB == CblasNoTrans) ? N : K; + int ldC = N; + clblasTranspose clTransA = to_clblasTranspose(TransA); + clblasTranspose clTransB = to_clblasTranspose(TransB); + CREATE_CL_MEM(A, M, K, READ_ONLY); + CREATE_CL_MEM(B, K, N, READ_ONLY); + CREATE_CL_MEM(C, M, N, READ_WRITE); + ENQUEUE_CL_BUFFER(Write, A, M, K); + ENQUEUE_CL_BUFFER(Write, B, K, N); + ENQUEUE_CL_BUFFER(Write, C, M, N); + PRE_CLBLAS_CALL; + // bufX is defined by the macro CREATE_CL_MEM(X, ...) + CLBLAS_CHECK(clblasDgemm(clblasRowMajor, clTransA, clTransB, + M, N, K, alpha, ARRAY(A), ARRAY(B), beta, ARRAY(C), + CLBLAS_TRAILING_ARGS)); + /* Release OpenCL memory objects. */ + RELEASE_CL_MEM(C); + RELEASE_CL_MEM(B); + RELEASE_CL_MEM(A); +} + +template <> +void caffe_opencl_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { + clblasTranspose clTransA = to_clblasTranspose(TransA); + int ldA = (TransA == CblasNoTrans) ? 
N : M; + int ldx = N; + int ldy = N; + CREATE_CL_MEM(A, M, N, READ_ONLY); + CREATE_CL_MEM(x, N, 1, READ_ONLY); + CREATE_CL_MEM(y, M, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasSgemv(clblasRowMajor, clTransA, M, N, alpha, + ARRAY(A), ARRAY(x), beta, ARRAY(y), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { + clblasTranspose clTransA = to_clblasTranspose(TransA); + int ldA = (TransA == CblasNoTrans) ? N : M; + int ldx = N; + int ldy = N; + CREATE_CL_MEM(A, M, N, READ_ONLY); + CREATE_CL_MEM(x, N, 1, READ_ONLY); + CREATE_CL_MEM(y, M, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasDgemv(clblasRowMajor, clTransA, M, N, alpha, + ARRAY(A), ARRAY(x), beta, ARRAY(y), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_axpy(const int N, const float alpha, const float* X, + float* Y) { + int ldX = N; + int ldY = N; + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasSaxpy( + N, alpha, ARRAY(X), ARRAY(Y), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_axpy(const int N, const double alpha, const double* X, + double* Y) { + int ldX = N; + int ldY = N; + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasDaxpy( + N, alpha, ARRAY(X), ARRAY(Y), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_copy(const int N, const float* X, float* Y) { + int ldX = N; + int ldY = N; + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasScopy( + N, ARRAY(X), ARRAY(Y), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_copy(const int N, const double* X, double* Y) { + int ldX = N; + int ldY = N; + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasDcopy( + N, ARRAY(X), ARRAY(Y), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_scal(const int N, const float alpha, float *X) { + int ldX = N; + CREATE_CL_MEM(X, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasSscal( + N, alpha, ARRAY(X), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_scal(const int N, const double alpha, double *X) { + int ldX = N; + CREATE_CL_MEM(X, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasDscal( + N, alpha, ARRAY(X), + CLBLAS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_axpby(const int N, const float alpha, const float* X, + const float beta, float* Y) { + caffe_opencl_scal(N, beta, Y); + caffe_opencl_axpy(N, alpha, X, Y); +} + +template <> +void caffe_opencl_axpby(const int N, const double alpha, const double* X, + const double beta, double* Y) { + caffe_opencl_scal(N, beta, Y); + caffe_opencl_axpy(N, alpha, X, Y); +} + +template +void caffe_opencl_copy_from_cpu(const int N, const Dtype *X, Dtype *Y) { + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + cl_bool blocking_write = CL_TRUE; + cl_uint num_events_in_wait_list = 0; + cl_event *event_wait_list = NULL; + cl_event events = NULL; + CL_CHECK(clEnqueueWriteBuffer( + CaffeOpenCL::queue(), bufY, blocking_write, 0, N * sizeof(Dtype), + X, num_events_in_wait_list, event_wait_list, &events)); +} + +template +void caffe_opencl_copy_from_cpu(const int N, const float *X, float *Y); +template +void caffe_opencl_copy_from_cpu(const int N, const double *X, double *Y); + + +template 
+void caffe_opencl_set(const int N, const Dtype alpha, Dtype *X) { +#ifdef CL_VERSION_1_2 + CREATE_CL_MEM(X, N, 1, READ_WRITE); + cl_uint num_events_in_wait_list = 0; + cl_event *event_wait_list = NULL; + cl_event event = NULL; + CL_CHECK(clEnqueueFillBuffer( + CaffeOpenCL::queue(), bufX, static_cast(&alpha), sizeof(Dtype), + 0, sizeof(Dtype) * N, num_events_in_wait_list, event_wait_list, &event)); +#else + std::vector tmp(N, alpha); + caffe_opencl_copy_from_cpu(N, &tmp[0], X); +#endif +} + +template +void caffe_opencl_set(const int N, const float alpha, float *X); +template +void caffe_opencl_set(const int N, const double alpha, double *X); + + +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sqr, y[i] = x[i] * x[i]); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(exp, y[i] = exp(x[i])); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sign, y[i] = sign(x[i])); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sgnbit, y[i] = signbit(x[i])); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(fabs, y[i] = fabs(x[i])); + +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(add, y[i] = a[i] + b[i]); +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(sub, y[i] = a[i] - b[i]); +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(mul, y[i] = a[i] * b[i]); +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(div, y[i] = a[i] / b[i]); + +} // namespace caffe + +#endif // USE_OPENCL
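For orientation, here is a plausible shape for the element-wise kernels behind DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC / _BINARY_FUNC. The real macros live in opencl_math_functions.hpp and may differ; the kernel source, kernel name, and the use of CaffeOpenCL::context() / CaffeOpenCL::queue() below are assumptions, and the sketch is float-only:

// Sketch: compile and launch a unary element-wise kernel computing y[i] = fabs(x[i]).
static void opencl_fabs_sketch(const int N, cl_mem bufX, cl_mem bufY) {
  static const char* kSource =
      "__kernel void unary_fabs(const int n, __global const float* x,\n"
      "                         __global float* y) {\n"
      "  int i = get_global_id(0);\n"
      "  if (i < n) { y[i] = fabs(x[i]); }\n"
      "}\n";
  cl_int error = CL_SUCCESS;
  cl_context ctx = CaffeOpenCL::context();
  cl_program program = clCreateProgramWithSource(ctx, 1, &kSource, NULL, &error);
  CL_CHECK(error);
  CL_CHECK(clBuildProgram(program, 0, NULL, NULL, NULL, NULL));
  cl_kernel kernel = clCreateKernel(program, "unary_fabs", &error);
  CL_CHECK(error);
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &N));
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufX));
  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufY));
  size_t global_size = static_cast<size_t>(N);
  CL_CHECK(clEnqueueNDRangeKernel(CaffeOpenCL::queue(), kernel, 1, NULL,
                                  &global_size, NULL, 0, NULL, NULL));
  CL_CHECK(clReleaseKernel(kernel));
  CL_CHECK(clReleaseProgram(program));
}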