From c94196989a7b8e73c13eef109b29597d45800df2 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Wed, 14 May 2014 16:18:26 +0800 Subject: [PATCH 01/75] Wrap the CPU and GPU math functions in math backend classes --- include/caffe/util/math_backends.hpp | 228 +++++++++++++++++++++++++++ src/caffe/util/cpu_math_backends.cpp | 152 ++++++++++++++++++ src/caffe/util/gpu_math_backends.cpp | 156 ++++++++++++++++++ src/caffe/util/math_backends.cpp | 29 ++++ 4 files changed, 565 insertions(+) create mode 100644 include/caffe/util/math_backends.hpp create mode 100644 src/caffe/util/cpu_math_backends.cpp create mode 100644 src/caffe/util/gpu_math_backends.cpp create mode 100644 src/caffe/util/math_backends.cpp diff --git a/include/caffe/util/math_backends.hpp b/include/caffe/util/math_backends.hpp new file mode 100644 index 00000000000..a01f0c89c0c --- /dev/null +++ b/include/caffe/util/math_backends.hpp @@ -0,0 +1,228 @@ +// Copyright 2014 BVLC and contributors. + +#ifndef CAFFE_UTIL_MATH_BACKENDS_H_ +#define CAFFE_UTIL_MATH_BACKENDS_H_ + +#include +#include + +#include "glog/logging.h" + +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +class MathBackend { +public: + virtual ~MathBackend() {} + virtual void gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C) = 0; + + virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y) = 0; + + virtual void axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y) = 0; + + virtual void axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y) = 0; + + virtual void copy(const int N, const Dtype *X, Dtype *Y) = 0; + + virtual void set(const int N, const Dtype alpha, Dtype *X) = 0; + + virtual void add_scalar(const int N, const Dtype alpha, Dtype *X) = 0; + + virtual void scal(const int N, const Dtype alpha, Dtype *X) = 0; + + virtual void sqr(const int N, const Dtype* a, Dtype* y) = 0; + + virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; + + virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; + + virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; + + virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; + + virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y) = 0; + + virtual void rng_uniform(const int N, const Dtype a, const Dtype b, + Dtype* r) = 0; + + virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, + Dtype* r) = 0; + + virtual void rng_bernoulli(const int N, const Dtype p, int* r) = 0; + + virtual void exp(const int N, const Dtype* a, Dtype* y) = 0; + + virtual void dot(const int N, const Dtype* x, const Dtype* y, + Dtype* out) = 0; + + virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, + uint32_t* out) = 0; + +// Returns the sum of the absolute values of the elements of vector x + virtual void asum(const int N, const Dtype* x, Dtype* y) = 0; + + virtual void sign(const int N, const Dtype* x, Dtype* y) = 0; + + virtual void sgnbit(const int N, const Dtype* x, Dtype* y) = 0; + + virtual void fabs(const int N, const Dtype* x, Dtype* y) = 0; + + virtual void scale(const int N, const Dtype alpha, const Dtype *x, + Dtype* y) = 0; +}; + +template +class CPUMathBackend: public MathBackend { +public: + 
CPUMathBackend() {} + virtual ~CPUMathBackend() {} + virtual void gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); + + virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); + + virtual void axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); + + virtual void axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + + virtual void copy(const int N, const Dtype *X, Dtype *Y); + + virtual void set(const int N, const Dtype alpha, Dtype *X); + + virtual void add_scalar(const int N, const Dtype alpha, Dtype *X); + + virtual void scal(const int N, const Dtype alpha, Dtype *X); + + virtual void sqr(const int N, const Dtype* a, Dtype* y); + + virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y); + + virtual void rng_uniform(const int N, const Dtype a, const Dtype b, + Dtype* r); + + virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, + Dtype* r); + + virtual void rng_bernoulli(const int N, const Dtype p, int* r); + + virtual void exp(const int N, const Dtype* a, Dtype* y); + + virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); + + virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, + uint32_t* out); + +// Returns the sum of the absolute values of the elements of vector x + virtual void asum(const int N, const Dtype* x, Dtype* y); + + virtual void sign(const int N, const Dtype* x, Dtype* y); + + virtual void sgnbit(const int N, const Dtype* x, Dtype* y); + + virtual void fabs(const int N, const Dtype* x, Dtype* y); + + virtual void scale(const int N, const Dtype alpha, const Dtype *x, + Dtype* y); +}; + +template +class GPUMathBackend: public MathBackend { +public: + GPUMathBackend() {} + virtual ~GPUMathBackend() {} + virtual void gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); + + virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); + + virtual void axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); + + virtual void axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + + virtual void copy(const int N, const Dtype *X, Dtype *Y); + + virtual void set(const int N, const Dtype alpha, Dtype *X); + + virtual void add_scalar(const int N, const Dtype alpha, Dtype *X); + + virtual void scal(const int N, const Dtype alpha, Dtype *X); + + virtual void sqr(const int N, const Dtype* a, Dtype* y); + + virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void powx(const int N, const Dtype* a, const 
Dtype b, Dtype* y); + + virtual void rng_uniform(const int N, const Dtype a, const Dtype b, + Dtype* r); + + virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, + Dtype* r); + + virtual void rng_bernoulli(const int N, const Dtype p, int* r); + + virtual void exp(const int N, const Dtype* a, Dtype* y); + + virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); + + virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, + uint32_t* out); + +// Returns the sum of the absolute values of the elements of vector x + virtual void asum(const int N, const Dtype* x, Dtype* y); + + virtual void sign(const int N, const Dtype* x, Dtype* y); + + virtual void sgnbit(const int N, const Dtype* x, Dtype* y); + + virtual void fabs(const int N, const Dtype* x, Dtype* y); + + virtual void scale(const int N, const Dtype alpha, const Dtype *x, + Dtype* y); +}; + +template +class MathBackendFactory { +public: + MathBackend* GetInstance(); +private: + static MathBackend* cpu_math_backend_; + static MathBackend* gpu_math_backend_; +}; + +} // namespace caffe + +#endif // CAFFE_UTIL_MATH_BACKENDS_H_ diff --git a/src/caffe/util/cpu_math_backends.cpp b/src/caffe/util/cpu_math_backends.cpp new file mode 100644 index 00000000000..4a7bf634ee2 --- /dev/null +++ b/src/caffe/util/cpu_math_backends.cpp @@ -0,0 +1,152 @@ +// Copyright 2014 BVLC and contributors. + +#include "caffe/common.hpp" +#include "caffe/util/math_backends.hpp" + +namespace caffe { +template +void CPUMathBackend::gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C) { + caffe_cpu_gemm(TransA, TransB, M, N, K, alpha, A, B, beta, C); +} + +template +void CPUMathBackend::gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const Dtype alpha, const Dtype* A, const Dtype* x, + const Dtype beta, Dtype* y) { + caffe_cpu_gemv(TransA, M, N, alpha, A, x, beta, y); +} + +template +void CPUMathBackend::axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y) { + caffe_axpy(N, alpha, X, Y); +} + +template +void CPUMathBackend::axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y) { + caffe_cpu_axpby(N, alpha, X, beta, Y); +} + +template +void CPUMathBackend::copy(const int N, const Dtype *X, Dtype *Y) { + caffe_copy(N, X, Y); +} + +template +void CPUMathBackend::set(const int N, const Dtype alpha, Dtype *X) { + caffe_set(N, alpha, X); +} + +template +void CPUMathBackend::add_scalar(const int N, const Dtype alpha, Dtype *X) { + caffe_add_scalar(N, alpha, X); +} + +template +void CPUMathBackend::scal(const int N, const Dtype alpha, Dtype *X) { + caffe_scal(N, alpha, X); +} + +template +void CPUMathBackend::sqr(const int N, const Dtype* a, Dtype* y) { + caffe_sqr(N, a, y); +} + +template +void CPUMathBackend::add(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_add(N, a, b, y); +} + +template +void CPUMathBackend::sub(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_sub(N, a, b, y); +} + +template +void CPUMathBackend::mul(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_mul(N, a, b, y); +} + +template +void CPUMathBackend::div(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_div(N, a, b, y); +} + +template +void CPUMathBackend::powx(const int N, const Dtype* a, const Dtype b, + Dtype* y) { + caffe_powx(N, a, b, y); +} + +template +void 
CPUMathBackend::rng_uniform(const int N, const Dtype a, const Dtype b, + Dtype* r) { + caffe_rng_uniform(N, a, b, r); +} + +template +void CPUMathBackend::rng_gaussian(const int N, const Dtype mu, + const Dtype sigma, Dtype* r) { + caffe_rng_gaussian(N, mu, sigma, r); +} + +template +void CPUMathBackend::rng_bernoulli(const int N, const Dtype p, int* r) { + caffe_rng_bernoulli(N, p, r); +} + +template +void CPUMathBackend::exp(const int N, const Dtype* a, Dtype* y) { + caffe_exp(N, a, y); +} + +template +void CPUMathBackend::dot(const int N, const Dtype* x, const Dtype* y, + Dtype* out) { + *out = caffe_cpu_dot(N, x, y); +} + +template +void CPUMathBackend::hamming_distance(const int N, const Dtype* x, + const Dtype* y, uint32_t* out) { + *out = caffe_cpu_hamming_distance(N, x, y); +} + +template +// Returns the sum of the absolute values of the elements of vector x +void CPUMathBackend::asum(const int N, const Dtype* x, Dtype* y) { + *y = caffe_cpu_asum(N, x); +} + +template +void CPUMathBackend::sign(const int N, const Dtype* x, Dtype* y) { + caffe_cpu_sign(N, x, y); +} + +template +void CPUMathBackend::sgnbit(const int N, const Dtype* x, Dtype* y) { + caffe_gpu_sgnbit(N, x, y); +} + +template +void CPUMathBackend::fabs(const int N, const Dtype* x, Dtype* y) { + caffe_cpu_fabs(N, x, y); +} + +template +void CPUMathBackend::scale(const int N, const Dtype alpha, const Dtype *x, + Dtype* y) { + caffe_cpu_scale(N, alpha, x, y); +} + +INSTANTIATE_CLASS(CPUMathBackend); + +} // namespace caffe diff --git a/src/caffe/util/gpu_math_backends.cpp b/src/caffe/util/gpu_math_backends.cpp new file mode 100644 index 00000000000..6d22046943c --- /dev/null +++ b/src/caffe/util/gpu_math_backends.cpp @@ -0,0 +1,156 @@ +// Copyright 2014 BVLC and contributors. + +#include "caffe/common.hpp" +#include "caffe/util/math_backends.hpp" + +namespace caffe { +template +void GPUMathBackend::gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C) { + caffe_gpu_gemm(TransA, TransB, M, N, K, alpha, A, B, beta, C); +} + +template +void GPUMathBackend::gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const Dtype alpha, const Dtype* A, const Dtype* x, + const Dtype beta, Dtype* y) { + caffe_gpu_gemv(TransA, M, N, alpha, A, x, beta, y); +} + +template +void GPUMathBackend::axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y) { + caffe_gpu_axpy(N, alpha, X, Y); +} + +template +void GPUMathBackend::axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y) { + caffe_gpu_axpby(N, alpha, X, beta, Y); +} + +template +void GPUMathBackend::copy(const int N, const Dtype *X, Dtype *Y) { + caffe_gpu_copy(N, X, Y); +} + +template +void GPUMathBackend::set(const int N, const Dtype alpha, Dtype *X) { + caffe_gpu_set(N, alpha, X); +} + +template +void GPUMathBackend::add_scalar(const int N, const Dtype alpha, Dtype *X) { + caffe_gpu_add_scalar(N, alpha, X); +} + +template +void GPUMathBackend::scal(const int N, const Dtype alpha, Dtype *X) { + caffe_gpu_scal(N, alpha, X); +} + +template +void GPUMathBackend::sqr(const int N, const Dtype* a, Dtype* y) { + NOT_IMPLEMENTED; +// caffe_gpu_sqr(N, a, y); +} + +template +void GPUMathBackend::add(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + NOT_IMPLEMENTED; +// caffe_gpu_add(N, a, b, y); +} + +template +void GPUMathBackend::sub(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + 
NOT_IMPLEMENTED; +// caffe_gpu_sub(N, a, b, y); +} + +template +void GPUMathBackend::mul(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_gpu_mul(N, a, b, y); +} + +template +void GPUMathBackend::div(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_gpu_div(N, a, b, y); +} + +template +void GPUMathBackend::powx(const int N, const Dtype* a, const Dtype b, + Dtype* y) { + caffe_gpu_powx(N, a, b, y); +} + +template +void GPUMathBackend::rng_uniform(const int N, const Dtype a, const Dtype b, + Dtype* r) { + caffe_gpu_rng_uniform(N, a, b, r); +} + +template +void GPUMathBackend::rng_gaussian(const int N, const Dtype mu, + const Dtype sigma, Dtype* r) { + caffe_gpu_rng_gaussian(N, mu, sigma, r); +} + +template +void GPUMathBackend::rng_bernoulli(const int N, const Dtype p, int* r) { + caffe_gpu_rng_bernoulli(N, p, r); +} + +template +void GPUMathBackend::exp(const int N, const Dtype* a, Dtype* y) { + NOT_IMPLEMENTED; +// caffe_gpu_exp(N, a, y); +} + +template +void GPUMathBackend::dot(const int N, const Dtype* x, const Dtype* y, + Dtype* out) { + caffe_gpu_dot(N, x, y, out); +} + +template +void GPUMathBackend::hamming_distance(const int N, const Dtype* x, + const Dtype* y, uint32_t* out) { + *out = caffe_gpu_hamming_distance(N, x, y); +} + +template +// Returns the sum of the absolute values of the elements of vector x +void GPUMathBackend::asum(const int N, const Dtype* x, Dtype* y) { + caffe_gpu_asum(N, x, y); +} + +template +void GPUMathBackend::sign(const int N, const Dtype* x, Dtype* y) { + caffe_gpu_sign(N, x, y); +} + +template +void GPUMathBackend::sgnbit(const int N, const Dtype* x, Dtype* y) { + caffe_gpu_sgnbit(N, x, y); +} + +template +void GPUMathBackend::fabs(const int N, const Dtype* x, Dtype* y) { + caffe_gpu_fabs(N, x, y); +} + +template +void GPUMathBackend::scale(const int N, const Dtype alpha, const Dtype *x, + Dtype* y) { + caffe_gpu_scale(N, alpha, x, y); +} + +INSTANTIATE_CLASS(GPUMathBackend); + +} // namespace caffe diff --git a/src/caffe/util/math_backends.cpp b/src/caffe/util/math_backends.cpp new file mode 100644 index 00000000000..698254ed01a --- /dev/null +++ b/src/caffe/util/math_backends.cpp @@ -0,0 +1,29 @@ +// Copyright 2014 BVLC and contributors. 
+ +#include "caffe/common.hpp" +#include "caffe/util/math_backends.hpp" + +namespace caffe { + +template +MathBackend* MathBackendFactory::GetInstance() { + switch (Caffe::mode()) { + case Caffe::CPU: + return cpu_math_backend_; + case Caffe::GPU: + return gpu_math_backend_; + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast*>(NULL); + } +} +template +MathBackend* MathBackendFactory::cpu_math_backend_ = + new CPUMathBackend(); +template +MathBackend* MathBackendFactory::gpu_math_backend_ = + new GPUMathBackend(); + +INSTANTIATE_CLASS(MathBackendFactory); + +} // namespace caffe From 540f1034970b52364659c939725be97fa3e745e2 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Wed, 14 May 2014 18:56:32 +0800 Subject: [PATCH 02/75] Add the math backend to the Layer base class --- include/caffe/layer.hpp | 6 +++++- include/caffe/util/math_backends.hpp | 2 +- src/caffe/util/gpu_math_backends.cpp | 3 ++- src/caffe/util/math_backends.cpp | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index 690c36ba23f..5c6944dc63a 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -9,6 +9,7 @@ #include "caffe/blob.hpp" #include "caffe/common.hpp" #include "caffe/proto/caffe.pb.h" +#include "caffe/util/math_backends.hpp" using std::string; using std::vector; @@ -22,7 +23,7 @@ class Layer { // to SetUp(), where the dimensions of the bottom blobs are provided to the // layer. explicit Layer(const LayerParameter& param) - : layer_param_(param) { + : layer_param_(param), math_(MathBackendFactory::GetMathBackend()) { // The only thing we do is to copy blobs if there are any. if (layer_param_.blobs_size() > 0) { blobs_.resize(layer_param_.blobs_size()); @@ -97,6 +98,9 @@ class Layer { LayerParameter layer_param_; // The vector that stores the parameters as a set of blobs. vector > > blobs_; + // The math backend abstracts the CPU and the GPU specific + // implementation details + MathBackend* math_; // Forward functions: compute the layer output // (and loss layers return the loss; other layers return the dummy value 0.) 
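
The layer.hpp hunk above gives every Layer a shared, mode-dispatched math_ member set in the constructor via MathBackendFactory<Dtype>::GetMathBackend(). As a minimal sketch of the intended usage (the EltwiseSumLayer name and the layer itself are hypothetical, not part of this series), a layer written against this interface can combine math_ with the device-neutral Blob accessors const_data()/mutable_data() added later in the series, instead of branching on Caffe::mode():

template <typename Dtype>
Dtype EltwiseSumLayer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
    vector<Blob<Dtype>*>* top) {
  // math_ dispatches to the CPU or GPU wrappers depending on Caffe::mode(),
  // so one Forward body covers both devices.
  const int count = bottom[0]->count();
  Dtype* top_data = (*top)[0]->mutable_data();
  this->math_->copy(count, bottom[0]->const_data(), top_data);
  for (int i = 1; i < bottom.size(); ++i) {
    // Accumulate: top_data += 1 * bottom[i]
    this->math_->axpy(count, Dtype(1), bottom[i]->const_data(), top_data);
  }
  return Dtype(0.);
}

The device-neutral ConcatLayer::Forward override introduced later in this series follows the same pattern.
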
diff --git a/include/caffe/util/math_backends.hpp b/include/caffe/util/math_backends.hpp index a01f0c89c0c..92860187f48 100644 --- a/include/caffe/util/math_backends.hpp +++ b/include/caffe/util/math_backends.hpp @@ -217,7 +217,7 @@ class GPUMathBackend: public MathBackend { template class MathBackendFactory { public: - MathBackend* GetInstance(); + static MathBackend* GetMathBackend(); private: static MathBackend* cpu_math_backend_; static MathBackend* gpu_math_backend_; diff --git a/src/caffe/util/gpu_math_backends.cpp b/src/caffe/util/gpu_math_backends.cpp index 6d22046943c..c2f907e1469 100644 --- a/src/caffe/util/gpu_math_backends.cpp +++ b/src/caffe/util/gpu_math_backends.cpp @@ -103,7 +103,8 @@ void GPUMathBackend::rng_gaussian(const int N, const Dtype mu, template void GPUMathBackend::rng_bernoulli(const int N, const Dtype p, int* r) { - caffe_gpu_rng_bernoulli(N, p, r); + NOT_IMPLEMENTED; +// caffe_gpu_rng_bernoulli(N, p, r); } template diff --git a/src/caffe/util/math_backends.cpp b/src/caffe/util/math_backends.cpp index 698254ed01a..85cd7b8db9e 100644 --- a/src/caffe/util/math_backends.cpp +++ b/src/caffe/util/math_backends.cpp @@ -6,7 +6,7 @@ namespace caffe { template -MathBackend* MathBackendFactory::GetInstance() { +MathBackend* MathBackendFactory::GetMathBackend() { switch (Caffe::mode()) { case Caffe::CPU: return cpu_math_backend_; From ab5ffab9209e729e50a767111b563216ce858b73 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Wed, 14 May 2014 19:59:20 +0800 Subject: [PATCH 03/75] Add device type independent getters to Blob --- include/caffe/blob.hpp | 6 +++++ src/caffe/blob.cpp | 52 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index c04375a10e2..30593388347 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -71,6 +71,12 @@ class Blob { Dtype* mutable_gpu_data(); Dtype* mutable_cpu_diff(); Dtype* mutable_gpu_diff(); + + const Dtype* const_data() const; + const Dtype* const_diff() const; + Dtype* mutable_data(); + Dtype* mutable_diff(); + void Update(); void FromProto(const BlobProto& proto); void ToProto(BlobProto* proto, bool write_diff = false) const; diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index e603712fd82..91fd04f3ff2 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -96,6 +96,58 @@ Dtype* Blob::mutable_gpu_diff() { return reinterpret_cast(diff_->mutable_gpu_data()); } +template +const Dtype* Blob::const_data() const { + switch (Caffe::mode()) { + case Caffe::CPU: + return cpu_data(); + case Caffe::GPU: + return gpu_data(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} + +template +const Dtype* Blob::const_diff() const { + switch (Caffe::mode()) { + case Caffe::CPU: + return cpu_diff(); + case Caffe::GPU: + return gpu_diff(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} + +template +Dtype* Blob::mutable_data() { + switch (Caffe::mode()) { + case Caffe::CPU: + return mutable_cpu_data(); + case Caffe::GPU: + return mutable_gpu_data(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} + +template +Dtype* Blob::mutable_diff() { + switch (Caffe::mode()) { + case Caffe::CPU: + return mutable_cpu_diff(); + case Caffe::GPU: + return mutable_gpu_diff(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} + template void Blob::ShareData(const Blob& other) { CHECK_EQ(count_, other.count()); From 
e347813f1fb89bf723bb486261b971770ea72252 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Thu, 15 May 2014 13:43:18 +0800 Subject: [PATCH 04/75] Remove tab from the code and reformat using google style --- include/caffe/util/math_backends.hpp | 242 +++++++++++++-------------- src/caffe/util/cpu_math_backends.cpp | 94 ++++++----- src/caffe/util/gpu_math_backends.cpp | 94 ++++++----- src/caffe/util/math_backends.cpp | 22 +-- 4 files changed, 227 insertions(+), 225 deletions(-) diff --git a/include/caffe/util/math_backends.hpp b/include/caffe/util/math_backends.hpp index 92860187f48..1fc78161680 100644 --- a/include/caffe/util/math_backends.hpp +++ b/include/caffe/util/math_backends.hpp @@ -14,213 +14,211 @@ namespace caffe { template class MathBackend { -public: - virtual ~MathBackend() {} - virtual void gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C) = 0; + public: + virtual ~MathBackend() { + } + virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C) = 0; - virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, - Dtype* y) = 0; + virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, + const Dtype beta, Dtype* y) = 0; - virtual void axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y) = 0; + virtual void axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y) = 0; - virtual void axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y) = 0; + virtual void axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y) = 0; - virtual void copy(const int N, const Dtype *X, Dtype *Y) = 0; + virtual void copy(const int N, const Dtype *X, Dtype *Y) = 0; - virtual void set(const int N, const Dtype alpha, Dtype *X) = 0; + virtual void set(const int N, const Dtype alpha, Dtype *X) = 0; - virtual void add_scalar(const int N, const Dtype alpha, Dtype *X) = 0; + virtual void add_scalar(const int N, const Dtype alpha, Dtype *X) = 0; - virtual void scal(const int N, const Dtype alpha, Dtype *X) = 0; + virtual void scal(const int N, const Dtype alpha, Dtype *X) = 0; - virtual void sqr(const int N, const Dtype* a, Dtype* y) = 0; + virtual void sqr(const int N, const Dtype* a, Dtype* y) = 0; - virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; + virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; - virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; + virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; - virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; + virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; - virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; + virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; - virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y) = 0; + virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y) = 0; - virtual void rng_uniform(const int N, const Dtype a, const Dtype b, - Dtype* r) = 0; + virtual void rng_uniform(const int N, const Dtype 
a, const Dtype b, + Dtype* r) = 0; - virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, - Dtype* r) = 0; + virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, + Dtype* r) = 0; - virtual void rng_bernoulli(const int N, const Dtype p, int* r) = 0; + virtual void rng_bernoulli(const int N, const Dtype p, int* r) = 0; - virtual void exp(const int N, const Dtype* a, Dtype* y) = 0; + virtual void exp(const int N, const Dtype* a, Dtype* y) = 0; - virtual void dot(const int N, const Dtype* x, const Dtype* y, - Dtype* out) = 0; + virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out) = 0; - virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, - uint32_t* out) = 0; + virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, + uint32_t* out) = 0; // Returns the sum of the absolute values of the elements of vector x - virtual void asum(const int N, const Dtype* x, Dtype* y) = 0; + virtual void asum(const int N, const Dtype* x, Dtype* y) = 0; - virtual void sign(const int N, const Dtype* x, Dtype* y) = 0; + virtual void sign(const int N, const Dtype* x, Dtype* y) = 0; - virtual void sgnbit(const int N, const Dtype* x, Dtype* y) = 0; + virtual void sgnbit(const int N, const Dtype* x, Dtype* y) = 0; - virtual void fabs(const int N, const Dtype* x, Dtype* y) = 0; + virtual void fabs(const int N, const Dtype* x, Dtype* y) = 0; - virtual void scale(const int N, const Dtype alpha, const Dtype *x, - Dtype* y) = 0; + virtual void scale(const int N, const Dtype alpha, const Dtype *x, + Dtype* y) = 0; }; template -class CPUMathBackend: public MathBackend { -public: - CPUMathBackend() {} - virtual ~CPUMathBackend() {} - virtual void gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C); +class CPUMathBackend : public MathBackend { + public: + CPUMathBackend() { + } + virtual ~CPUMathBackend() { + } + virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, const Dtype beta, Dtype* C); - virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, - Dtype* y); + virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, + const Dtype beta, Dtype* y); - virtual void axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); + virtual void axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); - virtual void axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y); + virtual void axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); - virtual void copy(const int N, const Dtype *X, Dtype *Y); + virtual void copy(const int N, const Dtype *X, Dtype *Y); - virtual void set(const int N, const Dtype alpha, Dtype *X); + virtual void set(const int N, const Dtype alpha, Dtype *X); - virtual void add_scalar(const int N, const Dtype alpha, Dtype *X); + virtual void add_scalar(const int N, const Dtype alpha, Dtype *X); - virtual void scal(const int N, const Dtype alpha, Dtype *X); + virtual void scal(const int N, const Dtype alpha, Dtype *X); - virtual void sqr(const int N, const Dtype* a, Dtype* y); + virtual void sqr(const int N, const Dtype* a, Dtype* y); - 
virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y); + virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y); - virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); + virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); - virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); - virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y); - virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y); + virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y); - virtual void rng_uniform(const int N, const Dtype a, const Dtype b, - Dtype* r); + virtual void rng_uniform(const int N, const Dtype a, const Dtype b, Dtype* r); - virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, - Dtype* r); + virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, + Dtype* r); - virtual void rng_bernoulli(const int N, const Dtype p, int* r); + virtual void rng_bernoulli(const int N, const Dtype p, int* r); - virtual void exp(const int N, const Dtype* a, Dtype* y); + virtual void exp(const int N, const Dtype* a, Dtype* y); - virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); + virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); - virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, - uint32_t* out); + virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, + uint32_t* out); // Returns the sum of the absolute values of the elements of vector x - virtual void asum(const int N, const Dtype* x, Dtype* y); + virtual void asum(const int N, const Dtype* x, Dtype* y); - virtual void sign(const int N, const Dtype* x, Dtype* y); + virtual void sign(const int N, const Dtype* x, Dtype* y); - virtual void sgnbit(const int N, const Dtype* x, Dtype* y); + virtual void sgnbit(const int N, const Dtype* x, Dtype* y); - virtual void fabs(const int N, const Dtype* x, Dtype* y); + virtual void fabs(const int N, const Dtype* x, Dtype* y); - virtual void scale(const int N, const Dtype alpha, const Dtype *x, - Dtype* y); + virtual void scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y); }; template -class GPUMathBackend: public MathBackend { -public: - GPUMathBackend() {} - virtual ~GPUMathBackend() {} - virtual void gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C); +class GPUMathBackend : public MathBackend { + public: + GPUMathBackend() { + } + virtual ~GPUMathBackend() { + } + virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, const Dtype beta, Dtype* C); - virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, - Dtype* y); + virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, + const Dtype beta, Dtype* y); - virtual void axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); + virtual void axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); - virtual void axpby(const int N, 
const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y); + virtual void axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); - virtual void copy(const int N, const Dtype *X, Dtype *Y); + virtual void copy(const int N, const Dtype *X, Dtype *Y); - virtual void set(const int N, const Dtype alpha, Dtype *X); + virtual void set(const int N, const Dtype alpha, Dtype *X); - virtual void add_scalar(const int N, const Dtype alpha, Dtype *X); + virtual void add_scalar(const int N, const Dtype alpha, Dtype *X); - virtual void scal(const int N, const Dtype alpha, Dtype *X); + virtual void scal(const int N, const Dtype alpha, Dtype *X); - virtual void sqr(const int N, const Dtype* a, Dtype* y); + virtual void sqr(const int N, const Dtype* a, Dtype* y); - virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y); + virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y); - virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); + virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); - virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); - virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y); - virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y); + virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y); - virtual void rng_uniform(const int N, const Dtype a, const Dtype b, - Dtype* r); + virtual void rng_uniform(const int N, const Dtype a, const Dtype b, Dtype* r); - virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, - Dtype* r); + virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, + Dtype* r); - virtual void rng_bernoulli(const int N, const Dtype p, int* r); + virtual void rng_bernoulli(const int N, const Dtype p, int* r); - virtual void exp(const int N, const Dtype* a, Dtype* y); + virtual void exp(const int N, const Dtype* a, Dtype* y); - virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); + virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); - virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, - uint32_t* out); + virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, + uint32_t* out); // Returns the sum of the absolute values of the elements of vector x - virtual void asum(const int N, const Dtype* x, Dtype* y); + virtual void asum(const int N, const Dtype* x, Dtype* y); - virtual void sign(const int N, const Dtype* x, Dtype* y); + virtual void sign(const int N, const Dtype* x, Dtype* y); - virtual void sgnbit(const int N, const Dtype* x, Dtype* y); + virtual void sgnbit(const int N, const Dtype* x, Dtype* y); - virtual void fabs(const int N, const Dtype* x, Dtype* y); + virtual void fabs(const int N, const Dtype* x, Dtype* y); - virtual void scale(const int N, const Dtype alpha, const Dtype *x, - Dtype* y); + virtual void scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y); }; template class MathBackendFactory { -public: - static MathBackend* GetMathBackend(); -private: - static MathBackend* cpu_math_backend_; - static MathBackend* gpu_math_backend_; + public: + static MathBackend* GetMathBackend(); + private: + static MathBackend* cpu_math_backend_; + static MathBackend* gpu_math_backend_; }; } // namespace caffe diff --git 
a/src/caffe/util/cpu_math_backends.cpp b/src/caffe/util/cpu_math_backends.cpp index 4a7bf634ee2..8ec25c33bb6 100644 --- a/src/caffe/util/cpu_math_backends.cpp +++ b/src/caffe/util/cpu_math_backends.cpp @@ -6,145 +6,147 @@ namespace caffe { template void CPUMathBackend::gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C) { - caffe_cpu_gemm(TransA, TransB, M, N, K, alpha, A, B, beta, C); + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, + const Dtype beta, Dtype* C) { + caffe_cpu_gemm(TransA, TransB, M, N, K, alpha, A, B, beta, C); } template void CPUMathBackend::gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const Dtype alpha, const Dtype* A, const Dtype* x, - const Dtype beta, Dtype* y) { - caffe_cpu_gemv(TransA, M, N, alpha, A, x, beta, y); + const int N, const Dtype alpha, const Dtype* A, + const Dtype* x, const Dtype beta, Dtype* y) { + caffe_cpu_gemv(TransA, M, N, alpha, A, x, beta, y); } template void CPUMathBackend::axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y) { - caffe_axpy(N, alpha, X, Y); + Dtype* Y) { + caffe_axpy(N, alpha, X, Y); } template -void CPUMathBackend::axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y) { - caffe_cpu_axpby(N, alpha, X, beta, Y); +void CPUMathBackend::axpby(const int N, const Dtype alpha, + const Dtype* X, const Dtype beta, Dtype* Y) { + caffe_cpu_axpby(N, alpha, X, beta, Y); } template void CPUMathBackend::copy(const int N, const Dtype *X, Dtype *Y) { - caffe_copy(N, X, Y); + caffe_copy(N, X, Y); } template void CPUMathBackend::set(const int N, const Dtype alpha, Dtype *X) { - caffe_set(N, alpha, X); + caffe_set(N, alpha, X); } template -void CPUMathBackend::add_scalar(const int N, const Dtype alpha, Dtype *X) { - caffe_add_scalar(N, alpha, X); +void CPUMathBackend::add_scalar(const int N, const Dtype alpha, + Dtype *X) { + caffe_add_scalar(N, alpha, X); } template void CPUMathBackend::scal(const int N, const Dtype alpha, Dtype *X) { - caffe_scal(N, alpha, X); + caffe_scal(N, alpha, X); } template void CPUMathBackend::sqr(const int N, const Dtype* a, Dtype* y) { - caffe_sqr(N, a, y); + caffe_sqr(N, a, y); } template void CPUMathBackend::add(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { - caffe_add(N, a, b, y); + Dtype* y) { + caffe_add(N, a, b, y); } template void CPUMathBackend::sub(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { - caffe_sub(N, a, b, y); + Dtype* y) { + caffe_sub(N, a, b, y); } template void CPUMathBackend::mul(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { - caffe_mul(N, a, b, y); + Dtype* y) { + caffe_mul(N, a, b, y); } template void CPUMathBackend::div(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { - caffe_div(N, a, b, y); + Dtype* y) { + caffe_div(N, a, b, y); } template void CPUMathBackend::powx(const int N, const Dtype* a, const Dtype b, - Dtype* y) { - caffe_powx(N, a, b, y); + Dtype* y) { + caffe_powx(N, a, b, y); } template -void CPUMathBackend::rng_uniform(const int N, const Dtype a, const Dtype b, - Dtype* r) { - caffe_rng_uniform(N, a, b, r); +void CPUMathBackend::rng_uniform(const int N, const Dtype a, + const Dtype b, Dtype* r) { + caffe_rng_uniform(N, a, b, r); } template void CPUMathBackend::rng_gaussian(const int N, const Dtype mu, - const Dtype sigma, Dtype* r) { - caffe_rng_gaussian(N, mu, 
sigma, r); + const Dtype sigma, Dtype* r) { + caffe_rng_gaussian(N, mu, sigma, r); } template void CPUMathBackend::rng_bernoulli(const int N, const Dtype p, int* r) { - caffe_rng_bernoulli(N, p, r); + caffe_rng_bernoulli(N, p, r); } template void CPUMathBackend::exp(const int N, const Dtype* a, Dtype* y) { - caffe_exp(N, a, y); + caffe_exp(N, a, y); } template void CPUMathBackend::dot(const int N, const Dtype* x, const Dtype* y, - Dtype* out) { - *out = caffe_cpu_dot(N, x, y); + Dtype* out) { + *out = caffe_cpu_dot(N, x, y); } template void CPUMathBackend::hamming_distance(const int N, const Dtype* x, - const Dtype* y, uint32_t* out) { - *out = caffe_cpu_hamming_distance(N, x, y); + const Dtype* y, uint32_t* out) { + *out = caffe_cpu_hamming_distance(N, x, y); } template // Returns the sum of the absolute values of the elements of vector x void CPUMathBackend::asum(const int N, const Dtype* x, Dtype* y) { - *y = caffe_cpu_asum(N, x); + *y = caffe_cpu_asum(N, x); } template void CPUMathBackend::sign(const int N, const Dtype* x, Dtype* y) { - caffe_cpu_sign(N, x, y); + caffe_cpu_sign(N, x, y); } template void CPUMathBackend::sgnbit(const int N, const Dtype* x, Dtype* y) { - caffe_gpu_sgnbit(N, x, y); + caffe_gpu_sgnbit(N, x, y); } template void CPUMathBackend::fabs(const int N, const Dtype* x, Dtype* y) { - caffe_cpu_fabs(N, x, y); + caffe_cpu_fabs(N, x, y); } template -void CPUMathBackend::scale(const int N, const Dtype alpha, const Dtype *x, - Dtype* y) { - caffe_cpu_scale(N, alpha, x, y); +void CPUMathBackend::scale(const int N, const Dtype alpha, + const Dtype *x, Dtype* y) { + caffe_cpu_scale(N, alpha, x, y); } INSTANTIATE_CLASS(CPUMathBackend); diff --git a/src/caffe/util/gpu_math_backends.cpp b/src/caffe/util/gpu_math_backends.cpp index c2f907e1469..5521da9c11c 100644 --- a/src/caffe/util/gpu_math_backends.cpp +++ b/src/caffe/util/gpu_math_backends.cpp @@ -6,150 +6,152 @@ namespace caffe { template void GPUMathBackend::gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C) { - caffe_gpu_gemm(TransA, TransB, M, N, K, alpha, A, B, beta, C); + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, + const Dtype beta, Dtype* C) { + caffe_gpu_gemm(TransA, TransB, M, N, K, alpha, A, B, beta, C); } template void GPUMathBackend::gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const Dtype alpha, const Dtype* A, const Dtype* x, - const Dtype beta, Dtype* y) { - caffe_gpu_gemv(TransA, M, N, alpha, A, x, beta, y); + const int N, const Dtype alpha, const Dtype* A, + const Dtype* x, const Dtype beta, Dtype* y) { + caffe_gpu_gemv(TransA, M, N, alpha, A, x, beta, y); } template void GPUMathBackend::axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y) { - caffe_gpu_axpy(N, alpha, X, Y); + Dtype* Y) { + caffe_gpu_axpy(N, alpha, X, Y); } template -void GPUMathBackend::axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y) { - caffe_gpu_axpby(N, alpha, X, beta, Y); +void GPUMathBackend::axpby(const int N, const Dtype alpha, + const Dtype* X, const Dtype beta, Dtype* Y) { + caffe_gpu_axpby(N, alpha, X, beta, Y); } template void GPUMathBackend::copy(const int N, const Dtype *X, Dtype *Y) { - caffe_gpu_copy(N, X, Y); + caffe_gpu_copy(N, X, Y); } template void GPUMathBackend::set(const int N, const Dtype alpha, Dtype *X) { - caffe_gpu_set(N, alpha, X); + 
caffe_gpu_set(N, alpha, X); } template -void GPUMathBackend::add_scalar(const int N, const Dtype alpha, Dtype *X) { - caffe_gpu_add_scalar(N, alpha, X); +void GPUMathBackend::add_scalar(const int N, const Dtype alpha, + Dtype *X) { + caffe_gpu_add_scalar(N, alpha, X); } template void GPUMathBackend::scal(const int N, const Dtype alpha, Dtype *X) { - caffe_gpu_scal(N, alpha, X); + caffe_gpu_scal(N, alpha, X); } template void GPUMathBackend::sqr(const int N, const Dtype* a, Dtype* y) { - NOT_IMPLEMENTED; + NOT_IMPLEMENTED; // caffe_gpu_sqr(N, a, y); } template void GPUMathBackend::add(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { - NOT_IMPLEMENTED; + Dtype* y) { + NOT_IMPLEMENTED; // caffe_gpu_add(N, a, b, y); } template void GPUMathBackend::sub(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { - NOT_IMPLEMENTED; + Dtype* y) { + NOT_IMPLEMENTED; // caffe_gpu_sub(N, a, b, y); } template void GPUMathBackend::mul(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { - caffe_gpu_mul(N, a, b, y); + Dtype* y) { + caffe_gpu_mul(N, a, b, y); } template void GPUMathBackend::div(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { - caffe_gpu_div(N, a, b, y); + Dtype* y) { + caffe_gpu_div(N, a, b, y); } template void GPUMathBackend::powx(const int N, const Dtype* a, const Dtype b, - Dtype* y) { - caffe_gpu_powx(N, a, b, y); + Dtype* y) { + caffe_gpu_powx(N, a, b, y); } template -void GPUMathBackend::rng_uniform(const int N, const Dtype a, const Dtype b, - Dtype* r) { - caffe_gpu_rng_uniform(N, a, b, r); +void GPUMathBackend::rng_uniform(const int N, const Dtype a, + const Dtype b, Dtype* r) { + caffe_gpu_rng_uniform(N, a, b, r); } template void GPUMathBackend::rng_gaussian(const int N, const Dtype mu, - const Dtype sigma, Dtype* r) { - caffe_gpu_rng_gaussian(N, mu, sigma, r); + const Dtype sigma, Dtype* r) { + caffe_gpu_rng_gaussian(N, mu, sigma, r); } template void GPUMathBackend::rng_bernoulli(const int N, const Dtype p, int* r) { - NOT_IMPLEMENTED; + NOT_IMPLEMENTED; // caffe_gpu_rng_bernoulli(N, p, r); } template void GPUMathBackend::exp(const int N, const Dtype* a, Dtype* y) { - NOT_IMPLEMENTED; + NOT_IMPLEMENTED; // caffe_gpu_exp(N, a, y); } template void GPUMathBackend::dot(const int N, const Dtype* x, const Dtype* y, - Dtype* out) { - caffe_gpu_dot(N, x, y, out); + Dtype* out) { + caffe_gpu_dot(N, x, y, out); } template void GPUMathBackend::hamming_distance(const int N, const Dtype* x, - const Dtype* y, uint32_t* out) { - *out = caffe_gpu_hamming_distance(N, x, y); + const Dtype* y, uint32_t* out) { + *out = caffe_gpu_hamming_distance(N, x, y); } template // Returns the sum of the absolute values of the elements of vector x void GPUMathBackend::asum(const int N, const Dtype* x, Dtype* y) { - caffe_gpu_asum(N, x, y); + caffe_gpu_asum(N, x, y); } template void GPUMathBackend::sign(const int N, const Dtype* x, Dtype* y) { - caffe_gpu_sign(N, x, y); + caffe_gpu_sign(N, x, y); } template void GPUMathBackend::sgnbit(const int N, const Dtype* x, Dtype* y) { - caffe_gpu_sgnbit(N, x, y); + caffe_gpu_sgnbit(N, x, y); } template void GPUMathBackend::fabs(const int N, const Dtype* x, Dtype* y) { - caffe_gpu_fabs(N, x, y); + caffe_gpu_fabs(N, x, y); } template -void GPUMathBackend::scale(const int N, const Dtype alpha, const Dtype *x, - Dtype* y) { - caffe_gpu_scale(N, alpha, x, y); +void GPUMathBackend::scale(const int N, const Dtype alpha, + const Dtype *x, Dtype* y) { + caffe_gpu_scale(N, alpha, x, y); } INSTANTIATE_CLASS(GPUMathBackend); diff --git 
a/src/caffe/util/math_backends.cpp b/src/caffe/util/math_backends.cpp index 85cd7b8db9e..3830887980f 100644 --- a/src/caffe/util/math_backends.cpp +++ b/src/caffe/util/math_backends.cpp @@ -7,22 +7,22 @@ namespace caffe { template MathBackend* MathBackendFactory::GetMathBackend() { - switch (Caffe::mode()) { - case Caffe::CPU: - return cpu_math_backend_; - case Caffe::GPU: - return gpu_math_backend_; - default: - LOG(FATAL) << "Unknown caffe mode."; - return static_cast*>(NULL); - } + switch (Caffe::mode()) { + case Caffe::CPU: + return cpu_math_backend_; + case Caffe::GPU: + return gpu_math_backend_; + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast*>(NULL); + } } template MathBackend* MathBackendFactory::cpu_math_backend_ = - new CPUMathBackend(); + new CPUMathBackend(); template MathBackend* MathBackendFactory::gpu_math_backend_ = - new GPUMathBackend(); + new GPUMathBackend(); INSTANTIATE_CLASS(MathBackendFactory); From 8b11f51af281fdca630153d575bf4720404146e0 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Thu, 15 May 2014 17:02:14 +0800 Subject: [PATCH 05/75] Allow Layer::Forward and Backward to be overridden --- include/caffe/layer.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index 5c6944dc63a..d6a103f920a 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -44,9 +44,9 @@ class Layer { // Forward and backward wrappers. You should implement the cpu and // gpu specific implementations instead, and should not change these // functions. - inline Dtype Forward(const vector*>& bottom, + virtual Dtype Forward(const vector*>& bottom, vector*>* top); - inline void Backward(const vector*>& top, + virtual void Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom); @@ -169,7 +169,7 @@ class Layer { // gpu specific implementations instead, and should not change these // functions. 
template -inline Dtype Layer::Forward(const vector*>& bottom, +Dtype Layer::Forward(const vector*>& bottom, vector*>* top) { switch (Caffe::mode()) { case Caffe::CPU: @@ -183,7 +183,7 @@ inline Dtype Layer::Forward(const vector*>& bottom, } template -inline void Layer::Backward(const vector*>& top, +void Layer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { switch (Caffe::mode()) { From 88ba49b7d3296b08fdce15644d5c4c9e2591ce01 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Thu, 15 May 2014 17:04:40 +0800 Subject: [PATCH 06/75] Use zero as the default return values of Blob data and diff methods --- src/caffe/blob.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index 91fd04f3ff2..4e0a14160cc 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -105,7 +105,7 @@ const Dtype* Blob::const_data() const { return gpu_data(); default: LOG(FATAL) << "Unknown caffe mode."; - return static_cast(NULL); + return static_cast(0); } } @@ -118,7 +118,7 @@ const Dtype* Blob::const_diff() const { return gpu_diff(); default: LOG(FATAL) << "Unknown caffe mode."; - return static_cast(NULL); + return static_cast(0); } } @@ -131,7 +131,7 @@ Dtype* Blob::mutable_data() { return mutable_gpu_data(); default: LOG(FATAL) << "Unknown caffe mode."; - return static_cast(NULL); + return static_cast(0); } } @@ -144,7 +144,7 @@ Dtype* Blob::mutable_diff() { return mutable_gpu_diff(); default: LOG(FATAL) << "Unknown caffe mode."; - return static_cast(NULL); + return static_cast(0); } } From e534e50a12b39622fbc65de50f4f32072e33acc9 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Thu, 15 May 2014 17:04:50 +0800 Subject: [PATCH 07/75] Add and test device type ignorant Forward and Backward in ConcatLayer --- include/caffe/vision_layers.hpp | 11 ++++ src/caffe/layers/concat_layer.cpp | 76 ++++++++++++++++++++++++++++ src/caffe/test/test_concat_layer.cpp | 8 +-- 3 files changed, 92 insertions(+), 3 deletions(-) diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index b68dcbf6e83..43ed431e722 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -61,6 +61,11 @@ class ConcatLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const bool propagate_down, + vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_CONCAT; @@ -78,6 +83,12 @@ class ConcatLayer : public Layer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, vector*>* bottom); + virtual Dtype Forward_xpu(const vector*>& bottom, + vector*>* top); + virtual void Backward_xpu(const vector*>& top, + const bool propagate_down, + vector*>* bottom); + Blob col_bob_; int count_; int num_; diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index 4c894ddffc4..0c2ca01b2c7 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -100,6 +100,82 @@ void ConcatLayer::Backward_cpu(const vector*>& top, } // concat_dim_ is guaranteed to be 0 or 1 by SetUp. 
} +template +Dtype ConcatLayer::Forward(const vector*>& bottom, + vector*>* top) { + return Forward_xpu(bottom, top); +} + +template +void ConcatLayer::Backward(const vector*>& top, + const bool propagate_down, + vector*>* bottom) { + return Backward_xpu(top, propagate_down, bottom); +} + +template +Dtype ConcatLayer::Forward_xpu(const vector*>& bottom, + vector*>* top) { + Dtype* top_data = (*top)[0]->mutable_data(); + if (concat_dim_== 0) { + int offset_num = 0; + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->const_data(); + int num_elem = bottom[i]->count(); + this->math_->copy(num_elem, bottom_data, + top_data+(*top)[0]->offset(offset_num)); + offset_num += bottom[i]->num(); + } + } else if (concat_dim_ == 1) { + int offset_channel = 0; + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->const_data(); + int num_elem = + bottom[i]->channels()*bottom[i]->height()*bottom[i]->width(); + for (int n = 0; n < num_; ++n) { + this->math_->copy(num_elem, bottom_data+bottom[i]->offset(n), + top_data+(*top)[0]->offset(n, offset_channel)); + } + offset_channel += bottom[i]->channels(); + } + } else { + LOG(FATAL) << "concat_dim along dim" << concat_dim_ << + " not implemented yet"; + } + return Dtype(0.); +} + +template +void ConcatLayer::Backward_xpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { + const Dtype* top_diff = top[0]->const_diff(); + if (concat_dim_ == 0) { + int offset_num = 0; + for (int i = 0; i < bottom->size(); ++i) { + Blob* blob = (*bottom)[i]; + Dtype* bottom_diff = blob->mutable_diff(); + this->math_->copy(blob->count(), + top_diff+top[0]->offset(offset_num), bottom_diff); + offset_num += blob->num(); + } + } else if (concat_dim_ == 1) { + int offset_channel = 0; + for (int i = 0; i < bottom->size(); ++i) { + Blob* blob = (*bottom)[i]; + Dtype* bottom_diff = blob->mutable_diff(); + int num_elem = blob->channels()*blob->height()*blob->width(); + for (int n = 0; n < num_; ++n) { + this->math_->copy(num_elem, top_diff+top[0]->offset(n, offset_channel), + bottom_diff+blob->offset(n)); + } + offset_channel += blob->channels(); + } + } else { + LOG(FATAL) << "concat_dim along dim" << concat_dim_ << + " not implemented yet"; + } +} + INSTANTIATE_CLASS(ConcatLayer); } // namespace caffe diff --git a/src/caffe/test/test_concat_layer.cpp b/src/caffe/test/test_concat_layer.cpp index 72e3c902cf1..66c50723379 100644 --- a/src/caffe/test/test_concat_layer.cpp +++ b/src/caffe/test/test_concat_layer.cpp @@ -84,8 +84,8 @@ TYPED_TEST(ConcatLayerTest, TestSetupChannels) { TYPED_TEST(ConcatLayerTest, TestCPUNum) { LayerParameter layer_param; - ConcatLayer layer(layer_param); Caffe::set_mode(Caffe::CPU); + ConcatLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_0, &(this->blob_top_vec_)); layer.Forward(this->blob_bottom_vec_0, &(this->blob_top_vec_)); for (int n = 0; n < this->blob_top_->num(); ++n) { @@ -93,7 +93,8 @@ TYPED_TEST(ConcatLayerTest, TestCPUNum) { for (int h = 0; h < this->blob_top_->height(); ++h) { for (int w = 0; w < this->blob_top_->width(); ++w) { EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_vec_0[0]->data_at(n, c, h, w)); + this->blob_bottom_vec_0[0]->data_at(n, c, h, w)) << + "n " << n << ", c " << c << ", h " << h << ", w " << w; } } } @@ -101,7 +102,8 @@ TYPED_TEST(ConcatLayerTest, TestCPUNum) { for (int h = 0; h < this->blob_top_->height(); ++h) { for (int w = 0; w < this->blob_top_->width(); ++w) { EXPECT_EQ(this->blob_top_->data_at(n, c+3, h, w), - 
this->blob_bottom_vec_0[1]->data_at(n, c, h, w)); + this->blob_bottom_vec_0[1]->data_at(n, c, h, w)) << + "n " << n << ", c " << c << ", h " << h << ", w " << w; } } } From 54b86aa267434ed05b5c2952455acdc43a3817ea Mon Sep 17 00:00:00 2001 From: Kai Li Date: Thu, 15 May 2014 18:23:20 +0800 Subject: [PATCH 08/75] Add default implementations of Layer::Forward_cpu and Backward_cpu --- include/caffe/layer.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index d6a103f920a..b3d7e76214e 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -105,7 +105,7 @@ class Layer { // Forward functions: compute the layer output // (and loss layers return the loss; other layers return the dummy value 0.) virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top) = 0; + vector*>* top) { return static_cast(0); } // If no gpu code is provided, we will simply use cpu code. virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top) { @@ -117,7 +117,7 @@ class Layer { // for the bottom blobs if propagate_down is true. virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, - vector*>* bottom) = 0; + vector*>* bottom) { return; } virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { From 0e65f7f1b959f1e584653278993da733c33051da Mon Sep 17 00:00:00 2001 From: Kai Li Date: Thu, 15 May 2014 18:24:13 +0800 Subject: [PATCH 09/75] Directly implement device neutral Forward and Backward in ConcatLayer --- include/caffe/vision_layers.hpp | 15 ------ src/caffe/layers/concat_layer.cpp | 73 ---------------------------- src/caffe/layers/concat_layer.cu | 79 ------------------------------- 3 files changed, 167 deletions(-) delete mode 100644 src/caffe/layers/concat_layer.cu diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 43ed431e722..9c0e2389ccb 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -74,21 +74,6 @@ class ConcatLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - - virtual Dtype Forward_xpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_xpu(const vector*>& top, - const bool propagate_down, - vector*>* bottom); - Blob col_bob_; int count_; int num_; diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index 0c2ca01b2c7..c28be6b2bd2 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -40,82 +40,9 @@ void ConcatLayer::SetUp(const vector*>& bottom, CHECK_EQ(count_, (*top)[0]->count()); } -template -Dtype ConcatLayer::Forward_cpu(const vector*>& bottom, - vector*>* top) { - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - if (concat_dim_== 0) { - int offset_num = 0; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); - int num_elem = bottom[i]->count(); - caffe_copy(num_elem, bottom_data, top_data+(*top)[0]->offset(offset_num)); - offset_num += bottom[i]->num(); - } - } else if (concat_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* 
bottom_data = bottom[i]->cpu_data(); - int num_elem = - bottom[i]->channels()*bottom[i]->height()*bottom[i]->width(); - for (int n = 0; n < num_; ++n) { - caffe_copy(num_elem, bottom_data+bottom[i]->offset(n), - top_data+(*top)[0]->offset(n, offset_channel)); - } - offset_channel += bottom[i]->channels(); - } // concat_dim_ is guaranteed to be 0 or 1 by SetUp. - } - return Dtype(0.); -} - -template -void ConcatLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - if (concat_dim_ == 0) { - int offset_num = 0; - for (int i = 0; i < bottom->size(); ++i) { - Blob* blob = (*bottom)[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_cpu_diff(); - caffe_copy(blob->count(), top_diff + top[0]->offset(offset_num), - bottom_diff); - } - offset_num += blob->num(); - } - } else if (concat_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < bottom->size(); ++i) { - Blob* blob = (*bottom)[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_cpu_diff(); - int num_elem = blob->channels()*blob->height()*blob->width(); - for (int n = 0; n < num_; ++n) { - caffe_copy(num_elem, top_diff + top[0]->offset(n, offset_channel), - bottom_diff + blob->offset(n)); - } - } - offset_channel += blob->channels(); - } - } // concat_dim_ is guaranteed to be 0 or 1 by SetUp. -} - template Dtype ConcatLayer::Forward(const vector*>& bottom, vector*>* top) { - return Forward_xpu(bottom, top); -} - -template -void ConcatLayer::Backward(const vector*>& top, - const bool propagate_down, - vector*>* bottom) { - return Backward_xpu(top, propagate_down, bottom); -} - -template -Dtype ConcatLayer::Forward_xpu(const vector*>& bottom, - vector*>* top) { Dtype* top_data = (*top)[0]->mutable_data(); if (concat_dim_== 0) { int offset_num = 0; diff --git a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/concat_layer.cu deleted file mode 100644 index ca0cf0c1b5b..00000000000 --- a/src/caffe/layers/concat_layer.cu +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
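// A minimal sketch of the index arithmetic the device-neutral concat code
// above relies on, assuming the usual Caffe N x C x H x W blob layout
// (the helper below is hypothetical, not part of the patch series):
// Blob::offset(n, c, h, w) is the flat index
//   ((n * channels + c) * height + h) * width + w,
// so copying channels*height*width elements starting at
// offset(n, offset_channel) appends one image's slice of bottom[i] right
// after the channels that earlier bottoms already wrote.
#include <cstddef>
inline std::size_t blob_offset(int channels, int height, int width,
                               int n, int c = 0, int h = 0, int w = 0) {
  return ((static_cast<std::size_t>(n) * channels + c) * height + h) * width
      + w;
}
// Example: for a 2x3x4x5 top blob, image n = 1 with offset_channel = 2
// starts writing at blob_offset(3, 4, 5, 1, 2) == 100.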
- -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype ConcatLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - if (concat_dim_ == 0) { - int offset_num = 0; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - caffe_gpu_copy(bottom[i]->count(), bottom_data, - top_data + (*top)[0]->offset(offset_num)); - offset_num += bottom[i]->num(); - } - } else if (concat_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - int num_elem = - bottom[i]->channels() * bottom[i]->height() * bottom[i]->width(); - for (int n = 0; n < num_; ++n) { - caffe_gpu_copy(num_elem, bottom_data+bottom[i]->offset(n), - top_data + (*top)[0]->offset(n, offset_channel)); - } - offset_channel += bottom[i]->channels(); - } - } else { - LOG(FATAL) << "concat_dim along dim" << concat_dim_ << - " not implemented yet"; - } - return Dtype(0.); -} - -template -void ConcatLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - if (concat_dim_ == 0) { - int offset_num = 0; - for (int i = 0; i < bottom->size(); ++i) { - Blob* blob = (*bottom)[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_gpu_diff(); - caffe_gpu_copy(blob->count(), top_diff + top[0]->offset(offset_num), - bottom_diff); - } - offset_num += blob->num(); - } - } else if (concat_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < bottom->size(); ++i) { - Blob* blob = (*bottom)[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_gpu_diff(); - int num_elem = blob->channels()*blob->height()*blob->width(); - for (int n = 0; n < num_; ++n) { - caffe_gpu_copy(num_elem, top_diff + top[0]->offset(n, offset_channel), - bottom_diff + blob->offset(n)); - } - } - offset_channel += blob->channels(); - } - } else { - LOG(FATAL) << "concat_dim along dim" << concat_dim_ << - " not implemented yet"; - } -} - -INSTANTIATE_CLASS(ConcatLayer); - -} // namespace caffe From 3d2e16e30f084127829510ab7dddc323f753faa5 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sat, 24 May 2014 23:57:58 +0800 Subject: [PATCH 10/75] Generalize the math backend classes into device wrapper classes --- include/caffe/layer.hpp | 6 +-- .../util/{math_backends.hpp => device.hpp} | 24 ++++----- src/caffe/layers/concat_layer.cpp | 10 ++-- .../{cpu_math_backends.cpp => cpu_device.cpp} | 54 +++++++++---------- ...{gpu_math_backends.cpp => cuda_device.cpp} | 54 +++++++++---------- src/caffe/util/device.cpp | 30 +++++++++++ src/caffe/util/math_backends.cpp | 29 ---------- 7 files changed, 104 insertions(+), 103 deletions(-) rename include/caffe/util/{math_backends.hpp => device.hpp} (94%) rename src/caffe/util/{cpu_math_backends.cpp => cpu_device.cpp} (61%) rename src/caffe/util/{gpu_math_backends.cpp => cuda_device.cpp} (62%) create mode 100644 src/caffe/util/device.cpp delete mode 100644 src/caffe/util/math_backends.cpp diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index b3d7e76214e..c4ba24c488e 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -9,7 +9,7 @@ #include "caffe/blob.hpp" #include "caffe/common.hpp" #include "caffe/proto/caffe.pb.h" -#include "caffe/util/math_backends.hpp" +#include "caffe/util/device.hpp" using std::string; using std::vector; @@ -23,7 
+23,7 @@ class Layer { // to SetUp(), where the dimensions of the bottom blobs are provided to the // layer. explicit Layer(const LayerParameter& param) - : layer_param_(param), math_(MathBackendFactory::GetMathBackend()) { + : layer_param_(param), device_(DeviceFactory::GetDevice()) { // The only thing we do is to copy blobs if there are any. if (layer_param_.blobs_size() > 0) { blobs_.resize(layer_param_.blobs_size()); @@ -100,7 +100,7 @@ class Layer { vector > > blobs_; // The math backend abstracts the CPU and the GPU specific // implementation details - MathBackend* math_; + Device* device_; // Forward functions: compute the layer output // (and loss layers return the loss; other layers return the dummy value 0.) diff --git a/include/caffe/util/math_backends.hpp b/include/caffe/util/device.hpp similarity index 94% rename from include/caffe/util/math_backends.hpp rename to include/caffe/util/device.hpp index 1fc78161680..32d627fcd4e 100644 --- a/include/caffe/util/math_backends.hpp +++ b/include/caffe/util/device.hpp @@ -13,9 +13,9 @@ namespace caffe { template -class MathBackend { +class Device { public: - virtual ~MathBackend() { + virtual ~Device() { } virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const Dtype alpha, @@ -81,11 +81,11 @@ class MathBackend { }; template -class CPUMathBackend : public MathBackend { +class CPUDevice : public Device { public: - CPUMathBackend() { + CPUDevice() { } - virtual ~CPUMathBackend() { + virtual ~CPUDevice() { } virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const Dtype alpha, @@ -147,11 +147,11 @@ class CPUMathBackend : public MathBackend { }; template -class GPUMathBackend : public MathBackend { +class GPUDevice : public Device { public: - GPUMathBackend() { + GPUDevice() { } - virtual ~GPUMathBackend() { + virtual ~GPUDevice() { } virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const Dtype alpha, @@ -213,12 +213,12 @@ class GPUMathBackend : public MathBackend { }; template -class MathBackendFactory { +class DeviceFactory { public: - static MathBackend* GetMathBackend(); + static Device* GetDevice(); private: - static MathBackend* cpu_math_backend_; - static MathBackend* gpu_math_backend_; + static Device* cpu_device_; + static Device* gpu_device_; }; } // namespace caffe diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index c28be6b2bd2..af1bc29748e 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -4,7 +4,7 @@ #include "caffe/layer.hpp" #include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" +#include "caffe/util/device.hpp" namespace caffe { @@ -49,7 +49,7 @@ Dtype ConcatLayer::Forward(const vector*>& bottom, for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->const_data(); int num_elem = bottom[i]->count(); - this->math_->copy(num_elem, bottom_data, + this->device_->copy(num_elem, bottom_data, top_data+(*top)[0]->offset(offset_num)); offset_num += bottom[i]->num(); } @@ -60,7 +60,7 @@ Dtype ConcatLayer::Forward(const vector*>& bottom, int num_elem = bottom[i]->channels()*bottom[i]->height()*bottom[i]->width(); for (int n = 0; n < num_; ++n) { - this->math_->copy(num_elem, bottom_data+bottom[i]->offset(n), + this->device_->copy(num_elem, bottom_data+bottom[i]->offset(n), top_data+(*top)[0]->offset(n, 
offset_channel)); } offset_channel += bottom[i]->channels(); @@ -81,7 +81,7 @@ void ConcatLayer::Backward_xpu(const vector*>& top, for (int i = 0; i < bottom->size(); ++i) { Blob* blob = (*bottom)[i]; Dtype* bottom_diff = blob->mutable_diff(); - this->math_->copy(blob->count(), + this->device_->copy(blob->count(), top_diff+top[0]->offset(offset_num), bottom_diff); offset_num += blob->num(); } @@ -92,7 +92,7 @@ void ConcatLayer::Backward_xpu(const vector*>& top, Dtype* bottom_diff = blob->mutable_diff(); int num_elem = blob->channels()*blob->height()*blob->width(); for (int n = 0; n < num_; ++n) { - this->math_->copy(num_elem, top_diff+top[0]->offset(n, offset_channel), + this->device_->copy(num_elem, top_diff+top[0]->offset(n, offset_channel), bottom_diff+blob->offset(n)); } offset_channel += blob->channels(); diff --git a/src/caffe/util/cpu_math_backends.cpp b/src/caffe/util/cpu_device.cpp similarity index 61% rename from src/caffe/util/cpu_math_backends.cpp rename to src/caffe/util/cpu_device.cpp index 8ec25c33bb6..52d2f6ec463 100644 --- a/src/caffe/util/cpu_math_backends.cpp +++ b/src/caffe/util/cpu_device.cpp @@ -1,11 +1,11 @@ // Copyright 2014 BVLC and contributors. #include "caffe/common.hpp" -#include "caffe/util/math_backends.hpp" +#include "caffe/util/device.hpp" namespace caffe { template -void CPUMathBackend::gemm(const CBLAS_TRANSPOSE TransA, +void CPUDevice::gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const Dtype alpha, const Dtype* A, const Dtype* B, @@ -14,141 +14,141 @@ void CPUMathBackend::gemm(const CBLAS_TRANSPOSE TransA, } template -void CPUMathBackend::gemv(const CBLAS_TRANSPOSE TransA, const int M, +void CPUDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, Dtype* y) { caffe_cpu_gemv(TransA, M, N, alpha, A, x, beta, y); } template -void CPUMathBackend::axpy(const int N, const Dtype alpha, const Dtype* X, +void CPUDevice::axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y) { caffe_axpy(N, alpha, X, Y); } template -void CPUMathBackend::axpby(const int N, const Dtype alpha, +void CPUDevice::axpby(const int N, const Dtype alpha, const Dtype* X, const Dtype beta, Dtype* Y) { caffe_cpu_axpby(N, alpha, X, beta, Y); } template -void CPUMathBackend::copy(const int N, const Dtype *X, Dtype *Y) { +void CPUDevice::copy(const int N, const Dtype *X, Dtype *Y) { caffe_copy(N, X, Y); } template -void CPUMathBackend::set(const int N, const Dtype alpha, Dtype *X) { +void CPUDevice::set(const int N, const Dtype alpha, Dtype *X) { caffe_set(N, alpha, X); } template -void CPUMathBackend::add_scalar(const int N, const Dtype alpha, +void CPUDevice::add_scalar(const int N, const Dtype alpha, Dtype *X) { caffe_add_scalar(N, alpha, X); } template -void CPUMathBackend::scal(const int N, const Dtype alpha, Dtype *X) { +void CPUDevice::scal(const int N, const Dtype alpha, Dtype *X) { caffe_scal(N, alpha, X); } template -void CPUMathBackend::sqr(const int N, const Dtype* a, Dtype* y) { +void CPUDevice::sqr(const int N, const Dtype* a, Dtype* y) { caffe_sqr(N, a, y); } template -void CPUMathBackend::add(const int N, const Dtype* a, const Dtype* b, +void CPUDevice::add(const int N, const Dtype* a, const Dtype* b, Dtype* y) { caffe_add(N, a, b, y); } template -void CPUMathBackend::sub(const int N, const Dtype* a, const Dtype* b, +void CPUDevice::sub(const int N, const Dtype* a, const Dtype* b, Dtype* y) { caffe_sub(N, a, b, y); } template -void 
CPUMathBackend::mul(const int N, const Dtype* a, const Dtype* b, +void CPUDevice::mul(const int N, const Dtype* a, const Dtype* b, Dtype* y) { caffe_mul(N, a, b, y); } template -void CPUMathBackend::div(const int N, const Dtype* a, const Dtype* b, +void CPUDevice::div(const int N, const Dtype* a, const Dtype* b, Dtype* y) { caffe_div(N, a, b, y); } template -void CPUMathBackend::powx(const int N, const Dtype* a, const Dtype b, +void CPUDevice::powx(const int N, const Dtype* a, const Dtype b, Dtype* y) { caffe_powx(N, a, b, y); } template -void CPUMathBackend::rng_uniform(const int N, const Dtype a, +void CPUDevice::rng_uniform(const int N, const Dtype a, const Dtype b, Dtype* r) { caffe_rng_uniform(N, a, b, r); } template -void CPUMathBackend::rng_gaussian(const int N, const Dtype mu, +void CPUDevice::rng_gaussian(const int N, const Dtype mu, const Dtype sigma, Dtype* r) { caffe_rng_gaussian(N, mu, sigma, r); } template -void CPUMathBackend::rng_bernoulli(const int N, const Dtype p, int* r) { +void CPUDevice::rng_bernoulli(const int N, const Dtype p, int* r) { caffe_rng_bernoulli(N, p, r); } template -void CPUMathBackend::exp(const int N, const Dtype* a, Dtype* y) { +void CPUDevice::exp(const int N, const Dtype* a, Dtype* y) { caffe_exp(N, a, y); } template -void CPUMathBackend::dot(const int N, const Dtype* x, const Dtype* y, +void CPUDevice::dot(const int N, const Dtype* x, const Dtype* y, Dtype* out) { *out = caffe_cpu_dot(N, x, y); } template -void CPUMathBackend::hamming_distance(const int N, const Dtype* x, +void CPUDevice::hamming_distance(const int N, const Dtype* x, const Dtype* y, uint32_t* out) { *out = caffe_cpu_hamming_distance(N, x, y); } template // Returns the sum of the absolute values of the elements of vector x -void CPUMathBackend::asum(const int N, const Dtype* x, Dtype* y) { +void CPUDevice::asum(const int N, const Dtype* x, Dtype* y) { *y = caffe_cpu_asum(N, x); } template -void CPUMathBackend::sign(const int N, const Dtype* x, Dtype* y) { +void CPUDevice::sign(const int N, const Dtype* x, Dtype* y) { caffe_cpu_sign(N, x, y); } template -void CPUMathBackend::sgnbit(const int N, const Dtype* x, Dtype* y) { +void CPUDevice::sgnbit(const int N, const Dtype* x, Dtype* y) { caffe_gpu_sgnbit(N, x, y); } template -void CPUMathBackend::fabs(const int N, const Dtype* x, Dtype* y) { +void CPUDevice::fabs(const int N, const Dtype* x, Dtype* y) { caffe_cpu_fabs(N, x, y); } template -void CPUMathBackend::scale(const int N, const Dtype alpha, +void CPUDevice::scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y) { caffe_cpu_scale(N, alpha, x, y); } -INSTANTIATE_CLASS(CPUMathBackend); +INSTANTIATE_CLASS(CPUDevice); } // namespace caffe diff --git a/src/caffe/util/gpu_math_backends.cpp b/src/caffe/util/cuda_device.cpp similarity index 62% rename from src/caffe/util/gpu_math_backends.cpp rename to src/caffe/util/cuda_device.cpp index 5521da9c11c..056b1c8ddb9 100644 --- a/src/caffe/util/gpu_math_backends.cpp +++ b/src/caffe/util/cuda_device.cpp @@ -1,11 +1,11 @@ // Copyright 2014 BVLC and contributors. 
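// The CPUDevice methods above are thin forwards to the existing caffe_*
// CPU routines; the gain is that layer math can be written once against
// Device<Dtype> and the same statement runs in either mode. A minimal,
// hypothetical helper (not taken from the patch series):
#include "caffe/util/device.hpp"
namespace caffe {
template <typename Dtype>
void scaled_accumulate(Device<Dtype>* dev, const int n, const Dtype alpha,
                       const Dtype* x, Dtype* y) {
  // y = alpha * x + y; resolves to caffe_axpy or caffe_gpu_axpy depending
  // on which Device implementation was handed in.
  dev->axpy(n, alpha, x, y);
}
}  // namespace caffe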
#include "caffe/common.hpp" -#include "caffe/util/math_backends.hpp" +#include "caffe/util/device.hpp" namespace caffe { template -void GPUMathBackend::gemm(const CBLAS_TRANSPOSE TransA, +void GPUDevice::gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const Dtype alpha, const Dtype* A, const Dtype* B, @@ -14,146 +14,146 @@ void GPUMathBackend::gemm(const CBLAS_TRANSPOSE TransA, } template -void GPUMathBackend::gemv(const CBLAS_TRANSPOSE TransA, const int M, +void GPUDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, Dtype* y) { caffe_gpu_gemv(TransA, M, N, alpha, A, x, beta, y); } template -void GPUMathBackend::axpy(const int N, const Dtype alpha, const Dtype* X, +void GPUDevice::axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y) { caffe_gpu_axpy(N, alpha, X, Y); } template -void GPUMathBackend::axpby(const int N, const Dtype alpha, +void GPUDevice::axpby(const int N, const Dtype alpha, const Dtype* X, const Dtype beta, Dtype* Y) { caffe_gpu_axpby(N, alpha, X, beta, Y); } template -void GPUMathBackend::copy(const int N, const Dtype *X, Dtype *Y) { +void GPUDevice::copy(const int N, const Dtype *X, Dtype *Y) { caffe_gpu_copy(N, X, Y); } template -void GPUMathBackend::set(const int N, const Dtype alpha, Dtype *X) { +void GPUDevice::set(const int N, const Dtype alpha, Dtype *X) { caffe_gpu_set(N, alpha, X); } template -void GPUMathBackend::add_scalar(const int N, const Dtype alpha, +void GPUDevice::add_scalar(const int N, const Dtype alpha, Dtype *X) { caffe_gpu_add_scalar(N, alpha, X); } template -void GPUMathBackend::scal(const int N, const Dtype alpha, Dtype *X) { +void GPUDevice::scal(const int N, const Dtype alpha, Dtype *X) { caffe_gpu_scal(N, alpha, X); } template -void GPUMathBackend::sqr(const int N, const Dtype* a, Dtype* y) { +void GPUDevice::sqr(const int N, const Dtype* a, Dtype* y) { NOT_IMPLEMENTED; // caffe_gpu_sqr(N, a, y); } template -void GPUMathBackend::add(const int N, const Dtype* a, const Dtype* b, +void GPUDevice::add(const int N, const Dtype* a, const Dtype* b, Dtype* y) { NOT_IMPLEMENTED; // caffe_gpu_add(N, a, b, y); } template -void GPUMathBackend::sub(const int N, const Dtype* a, const Dtype* b, +void GPUDevice::sub(const int N, const Dtype* a, const Dtype* b, Dtype* y) { NOT_IMPLEMENTED; // caffe_gpu_sub(N, a, b, y); } template -void GPUMathBackend::mul(const int N, const Dtype* a, const Dtype* b, +void GPUDevice::mul(const int N, const Dtype* a, const Dtype* b, Dtype* y) { caffe_gpu_mul(N, a, b, y); } template -void GPUMathBackend::div(const int N, const Dtype* a, const Dtype* b, +void GPUDevice::div(const int N, const Dtype* a, const Dtype* b, Dtype* y) { caffe_gpu_div(N, a, b, y); } template -void GPUMathBackend::powx(const int N, const Dtype* a, const Dtype b, +void GPUDevice::powx(const int N, const Dtype* a, const Dtype b, Dtype* y) { caffe_gpu_powx(N, a, b, y); } template -void GPUMathBackend::rng_uniform(const int N, const Dtype a, +void GPUDevice::rng_uniform(const int N, const Dtype a, const Dtype b, Dtype* r) { caffe_gpu_rng_uniform(N, a, b, r); } template -void GPUMathBackend::rng_gaussian(const int N, const Dtype mu, +void GPUDevice::rng_gaussian(const int N, const Dtype mu, const Dtype sigma, Dtype* r) { caffe_gpu_rng_gaussian(N, mu, sigma, r); } template -void GPUMathBackend::rng_bernoulli(const int N, const Dtype p, int* r) { +void GPUDevice::rng_bernoulli(const int N, const Dtype p, int* r) { 
NOT_IMPLEMENTED; // caffe_gpu_rng_bernoulli(N, p, r); } template -void GPUMathBackend::exp(const int N, const Dtype* a, Dtype* y) { +void GPUDevice::exp(const int N, const Dtype* a, Dtype* y) { NOT_IMPLEMENTED; // caffe_gpu_exp(N, a, y); } template -void GPUMathBackend::dot(const int N, const Dtype* x, const Dtype* y, +void GPUDevice::dot(const int N, const Dtype* x, const Dtype* y, Dtype* out) { caffe_gpu_dot(N, x, y, out); } template -void GPUMathBackend::hamming_distance(const int N, const Dtype* x, +void GPUDevice::hamming_distance(const int N, const Dtype* x, const Dtype* y, uint32_t* out) { *out = caffe_gpu_hamming_distance(N, x, y); } template // Returns the sum of the absolute values of the elements of vector x -void GPUMathBackend::asum(const int N, const Dtype* x, Dtype* y) { +void GPUDevice::asum(const int N, const Dtype* x, Dtype* y) { caffe_gpu_asum(N, x, y); } template -void GPUMathBackend::sign(const int N, const Dtype* x, Dtype* y) { +void GPUDevice::sign(const int N, const Dtype* x, Dtype* y) { caffe_gpu_sign(N, x, y); } template -void GPUMathBackend::sgnbit(const int N, const Dtype* x, Dtype* y) { +void GPUDevice::sgnbit(const int N, const Dtype* x, Dtype* y) { caffe_gpu_sgnbit(N, x, y); } template -void GPUMathBackend::fabs(const int N, const Dtype* x, Dtype* y) { +void GPUDevice::fabs(const int N, const Dtype* x, Dtype* y) { caffe_gpu_fabs(N, x, y); } template -void GPUMathBackend::scale(const int N, const Dtype alpha, +void GPUDevice::scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y) { caffe_gpu_scale(N, alpha, x, y); } -INSTANTIATE_CLASS(GPUMathBackend); +INSTANTIATE_CLASS(GPUDevice); } // namespace caffe diff --git a/src/caffe/util/device.cpp b/src/caffe/util/device.cpp new file mode 100644 index 00000000000..4eb59dc6861 --- /dev/null +++ b/src/caffe/util/device.cpp @@ -0,0 +1,30 @@ +// Copyright 2014 BVLC and contributors. + +#include "caffe/common.hpp" +#include "caffe/util/device.hpp" + +namespace caffe { + +template +Device* +DeviceFactory::GetDevice() { + switch (Caffe::mode()) { + case Caffe::CPU: + return cpu_device_; + case Caffe::GPU: + return gpu_device_; + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast*>(NULL); + } +} + +template +Device* DeviceFactory::cpu_device_ = new CPUDevice(); + +template +Device* DeviceFactory::gpu_device_ = new GPUDevice(); + +INSTANTIATE_CLASS(DeviceFactory); + +} // namespace caffe diff --git a/src/caffe/util/math_backends.cpp b/src/caffe/util/math_backends.cpp deleted file mode 100644 index 3830887980f..00000000000 --- a/src/caffe/util/math_backends.cpp +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
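// Typical call-site usage of the factory defined above, assuming
// Caffe::set_mode() has already been called (the helper itself is
// hypothetical, not part of the patch series):
#include "caffe/common.hpp"
#include "caffe/util/device.hpp"
namespace caffe {
template <typename Dtype>
void fill_with_ones(const int n, Dtype* x) {
  // Resolves to CPUDevice<Dtype>::set or GPUDevice<Dtype>::set depending on
  // Caffe::mode(); x must already live in the matching memory space.
  DeviceFactory<Dtype>::GetDevice()->set(n, Dtype(1), x);
}
}  // namespace caffe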
- -#include "caffe/common.hpp" -#include "caffe/util/math_backends.hpp" - -namespace caffe { - -template -MathBackend* MathBackendFactory::GetMathBackend() { - switch (Caffe::mode()) { - case Caffe::CPU: - return cpu_math_backend_; - case Caffe::GPU: - return gpu_math_backend_; - default: - LOG(FATAL) << "Unknown caffe mode."; - return static_cast*>(NULL); - } -} -template -MathBackend* MathBackendFactory::cpu_math_backend_ = - new CPUMathBackend(); -template -MathBackend* MathBackendFactory::gpu_math_backend_ = - new GPUMathBackend(); - -INSTANTIATE_CLASS(MathBackendFactory); - -} // namespace caffe From faef1ff5fccd0e2b9a4aa7f1aa278d7011f80096 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 25 May 2014 22:15:16 +0800 Subject: [PATCH 11/75] Add Device::copy_from_cpu for the data layers --- include/caffe/util/device.hpp | 3 +++ src/caffe/util/cpu_device.cpp | 5 +++++ src/caffe/util/{cuda_device.cpp => gpu_device.cpp} | 5 +++++ 3 files changed, 13 insertions(+) rename src/caffe/util/{cuda_device.cpp => gpu_device.cpp} (96%) diff --git a/include/caffe/util/device.hpp b/include/caffe/util/device.hpp index 32d627fcd4e..e9d95b6c9ce 100644 --- a/include/caffe/util/device.hpp +++ b/include/caffe/util/device.hpp @@ -33,6 +33,7 @@ class Device { const Dtype beta, Dtype* Y) = 0; virtual void copy(const int N, const Dtype *X, Dtype *Y) = 0; + virtual void copy_from_cpu(const int N, const Dtype* X, Dtype* Y) = 0; virtual void set(const int N, const Dtype alpha, Dtype *X) = 0; @@ -101,6 +102,7 @@ class CPUDevice : public Device { const Dtype beta, Dtype* Y); virtual void copy(const int N, const Dtype *X, Dtype *Y); + virtual void copy_from_cpu(const int N, const Dtype* X, Dtype* Y); virtual void set(const int N, const Dtype alpha, Dtype *X); @@ -167,6 +169,7 @@ class GPUDevice : public Device { const Dtype beta, Dtype* Y); virtual void copy(const int N, const Dtype *X, Dtype *Y); + virtual void copy_from_cpu(const int N, const Dtype* X, Dtype* Y); virtual void set(const int N, const Dtype alpha, Dtype *X); diff --git a/src/caffe/util/cpu_device.cpp b/src/caffe/util/cpu_device.cpp index 52d2f6ec463..6c8e5f42f47 100644 --- a/src/caffe/util/cpu_device.cpp +++ b/src/caffe/util/cpu_device.cpp @@ -37,6 +37,11 @@ void CPUDevice::copy(const int N, const Dtype *X, Dtype *Y) { caffe_copy(N, X, Y); } +template +void CPUDevice::copy_from_cpu(const int N, const Dtype *X, Dtype *Y) { + caffe_copy(N, X, Y); +} + template void CPUDevice::set(const int N, const Dtype alpha, Dtype *X) { caffe_set(N, alpha, X); diff --git a/src/caffe/util/cuda_device.cpp b/src/caffe/util/gpu_device.cpp similarity index 96% rename from src/caffe/util/cuda_device.cpp rename to src/caffe/util/gpu_device.cpp index 056b1c8ddb9..d19126ae665 100644 --- a/src/caffe/util/cuda_device.cpp +++ b/src/caffe/util/gpu_device.cpp @@ -37,6 +37,11 @@ void GPUDevice::copy(const int N, const Dtype *X, Dtype *Y) { caffe_gpu_copy(N, X, Y); } +template +void GPUDevice::copy_from_cpu(const int N, const Dtype *X, Dtype *Y) { + CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyHostToDevice)); +} + template void GPUDevice::set(const int N, const Dtype alpha, Dtype *X) { caffe_gpu_set(N, alpha, X); From c10fa56a457ca3ead70aae22b58c1f8fd65caedc Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 25 May 2014 22:16:05 +0800 Subject: [PATCH 12/75] Unify the CPU and the GPU Forward of the DataLayer --- include/caffe/data_layers.hpp | 6 ++--- src/caffe/layers/data_layer.cpp | 12 +++++----- src/caffe/layers/data_layer.cu | 39 --------------------------------- 3 files 
changed, 9 insertions(+), 48 deletions(-) delete mode 100644 src/caffe/layers/data_layer.cu diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 2c6be551d8f..04c71074423 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -111,6 +111,8 @@ class DataLayer : public Layer { virtual ~DataLayer(); virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_DATA; @@ -120,10 +122,6 @@ class DataLayer : public Layer { virtual inline int MaxTopBlobs() const { return 2; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) {} virtual void Backward_gpu(const vector*>& top, diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 29c4fec8ca4..06be98cf3f4 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -346,16 +346,18 @@ unsigned int DataLayer::PrefetchRand() { } template -Dtype DataLayer::Forward_cpu(const vector*>& bottom, +Dtype DataLayer::Forward(const vector*>& bottom, vector*>* top) { // First, join the thread JoinPrefetchThread(); // Copy the data - caffe_copy(prefetch_data_->count(), prefetch_data_->cpu_data(), - (*top)[0]->mutable_cpu_data()); + this->device_->copy_from_cpu( + prefetch_data_->count(), prefetch_data_->cpu_data(), + (*top)[0]->mutable_cpu_data()); if (output_labels_) { - caffe_copy(prefetch_label_->count(), prefetch_label_->cpu_data(), - (*top)[1]->mutable_cpu_data()); + this->device_->copy_from_cpu( + prefetch_label_->count(), prefetch_label_->cpu_data(), + (*top)[1]->mutable_cpu_data()); } // Start a new prefetch thread CreatePrefetchThread(); diff --git a/src/caffe/layers/data_layer.cu b/src/caffe/layers/data_layer.cu deleted file mode 100644 index 2ff9a292b3e..00000000000 --- a/src/caffe/layers/data_layer.cu +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
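// Every prefetching data layer in this series ends up with the same Forward
// shape: join the prefetch thread, push the host-side prefetch buffers into
// the top blobs through Device::copy_from_cpu, then restart the thread.
// Sketched generically below; the free-standing helper is hypothetical and
// only illustrates the shared pattern:
#include "caffe/blob.hpp"
#include "caffe/util/device.hpp"
namespace caffe {
template <typename Dtype>
void publish_prefetched(Device<Dtype>* dev, const Blob<Dtype>& prefetched,
                        Blob<Dtype>* top) {
  // The source is always host memory filled by the prefetch thread; the
  // device wrapper decides whether the copy is a plain CPU copy or a
  // host-to-device cudaMemcpy.
  dev->copy_from_cpu(prefetched.count(), prefetched.cpu_data(),
                     top->mutable_cpu_data());
}
}  // namespace caffe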
- -#include -#include -#include - -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -using std::string; - -namespace caffe { - -template -Dtype DataLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - // First, join the thread - JoinPrefetchThread(); - // Copy the data - CUDA_CHECK(cudaMemcpy((*top)[0]->mutable_gpu_data(), - prefetch_data_->cpu_data(), sizeof(Dtype) * prefetch_data_->count(), - cudaMemcpyHostToDevice)); - if (output_labels_) { - CUDA_CHECK(cudaMemcpy((*top)[1]->mutable_gpu_data(), - prefetch_label_->cpu_data(), sizeof(Dtype) * prefetch_label_->count(), - cudaMemcpyHostToDevice)); - } - // Start a new prefetch thread - CreatePrefetchThread(); - return Dtype(0.); -} - -INSTANTIATE_CLASS(DataLayer); - -} // namespace caffe From a38d30a276dd06ecbb4a3a76aa2e62b1da5a15a6 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 25 May 2014 22:22:31 +0800 Subject: [PATCH 13/75] Unify the CPU and the GPU Forward of the ImageDataLayer --- include/caffe/data_layers.hpp | 6 ++-- src/caffe/layers/image_data_layer.cpp | 12 ++++---- src/caffe/layers/image_data_layer.cu | 43 --------------------------- 3 files changed, 9 insertions(+), 52 deletions(-) delete mode 100644 src/caffe/layers/image_data_layer.cu diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 04c71074423..9ff62a3e111 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -196,6 +196,8 @@ class ImageDataLayer : public Layer { virtual ~ImageDataLayer(); virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_IMAGE_DATA; @@ -204,10 +206,6 @@ class ImageDataLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 2; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) {} virtual void Backward_gpu(const vector*>& top, diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index 1f7368e7e4d..da97a4aa9cb 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -273,15 +273,17 @@ unsigned int ImageDataLayer::PrefetchRand() { } template -Dtype ImageDataLayer::Forward_cpu(const vector*>& bottom, +Dtype ImageDataLayer::Forward(const vector*>& bottom, vector*>* top) { // First, join the thread JoinPrefetchThread(); // Copy the data - caffe_copy(prefetch_data_->count(), prefetch_data_->cpu_data(), - (*top)[0]->mutable_cpu_data()); - caffe_copy(prefetch_label_->count(), prefetch_label_->cpu_data(), - (*top)[1]->mutable_cpu_data()); + this->device_->copy_from_cpu( + prefetch_data_->count(), prefetch_data_->cpu_data(), + (*top)[0]->mutable_cpu_data()); + this->device_->copy_from_cpu( + prefetch_label_->count(), prefetch_label_->cpu_data(), + (*top)[1]->mutable_cpu_data()); // Start a new prefetch thread CreatePrefetchThread(); return Dtype(0.); diff --git a/src/caffe/layers/image_data_layer.cu b/src/caffe/layers/image_data_layer.cu deleted file mode 100644 index 98047297d80..00000000000 --- a/src/caffe/layers/image_data_layer.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
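// Device::copy_from_cpu is what lets duplicated Forward_gpu bodies such as
// the one deleted here disappear: the CPU implementation is a plain
// caffe_copy(N, X, Y), while the GPU implementation wraps
// CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyHostToDevice)),
// so the cudaMemcpy calls removed from this file remain reachable through
// the single call this->device_->copy_from_cpu(N, X, Y) in the .cpp version.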
- -#include -#include -#include -#include - -#include -#include -#include // NOLINT(readability/streams) -#include // NOLINT(readability/streams) - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -using std::string; -using std::pair; - -namespace caffe { - -template -Dtype ImageDataLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - // First, join the thread - JoinPrefetchThread(); - // Copy the data - CUDA_CHECK(cudaMemcpy((*top)[0]->mutable_gpu_data(), - prefetch_data_->cpu_data(), sizeof(Dtype) * prefetch_data_->count(), - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy((*top)[1]->mutable_gpu_data(), - prefetch_label_->cpu_data(), sizeof(Dtype) * prefetch_label_->count(), - cudaMemcpyHostToDevice)); - // Start a new prefetch thread - CreatePrefetchThread(); - return Dtype(0.); -} - -INSTANTIATE_CLASS(ImageDataLayer); - -} // namespace caffe From e36233349e239c5e7bcbf8a54aff8366fe793acc Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 25 May 2014 22:34:16 +0800 Subject: [PATCH 14/75] Unify the CPU and the GPU Forward of the HDF5DataLayer --- include/caffe/data_layers.hpp | 6 +-- src/caffe/layers/hdf5_data_layer.cpp | 15 +++---- src/caffe/layers/hdf5_data_layer.cu | 59 ---------------------------- 3 files changed, 10 insertions(+), 70 deletions(-) delete mode 100644 src/caffe/layers/hdf5_data_layer.cu diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 9ff62a3e111..843fdadefc2 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -67,6 +67,8 @@ class HDF5DataLayer : public Layer { virtual ~HDF5DataLayer(); virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_HDF5_DATA; @@ -75,10 +77,6 @@ class HDF5DataLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 2; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) {} virtual void Backward_gpu(const vector*>& top, diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 2ba7fa77f45..1be99ae5dbb 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -86,7 +86,7 @@ void HDF5DataLayer::SetUp(const vector*>& bottom, } template -Dtype HDF5DataLayer::Forward_cpu(const vector*>& bottom, +Dtype HDF5DataLayer::Forward(const vector*>& bottom, vector*>* top) { const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); const int data_count = (*top)[0]->count() / (*top)[0]->num(); @@ -104,12 +104,13 @@ Dtype HDF5DataLayer::Forward_cpu(const vector*>& bottom, } current_row_ = 0; } - memcpy(&(*top)[0]->mutable_cpu_data()[i * data_count], - &data_blob_.cpu_data()[current_row_ * data_count], - sizeof(Dtype) * data_count); - memcpy(&(*top)[1]->mutable_cpu_data()[i * label_data_count], - &label_blob_.cpu_data()[current_row_ * label_data_count], - sizeof(Dtype) * label_data_count); + this->device_->copy_from_cpu( + data_count, &data_blob_.cpu_data()[current_row_ * data_count], + &(*top)[0]->mutable_cpu_data()[i * data_count]); + this->device_->copy_from_cpu( + label_data_count, + &label_blob_.cpu_data()[current_row_ * 
label_data_count], + &(*top)[1]->mutable_cpu_data()[i * label_data_count]); } return Dtype(0.); } diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu deleted file mode 100644 index b2b09ef7dd1..00000000000 --- a/src/caffe/layers/hdf5_data_layer.cu +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2014 BVLC and contributors. -/* -TODO: -- only load parts of the file, in accordance with a prototxt param "max_mem" -*/ - -#include -#include -#include - -#include "hdf5.h" -#include "hdf5_hl.h" - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -using std::string; - -namespace caffe { - -template -Dtype HDF5DataLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - const int data_count = (*top)[0]->count() / (*top)[0]->num(); - const int label_data_count = (*top)[1]->count() / (*top)[1]->num(); - - for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == data_blob_.num()) { - if (num_files_ > 1) { - current_file_ += 1; - - if (current_file_ == num_files_) { - current_file_ = 0; - LOG(INFO) << "looping around to first file"; - } - - LoadHDF5FileData(hdf_filenames_[current_file_].c_str()); - } - current_row_ = 0; - } - CUDA_CHECK(cudaMemcpy( - &(*top)[0]->mutable_gpu_data()[i * data_count], - &data_blob_.cpu_data()[current_row_ * data_count], - sizeof(Dtype) * data_count, - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy( - &(*top)[1]->mutable_gpu_data()[i * label_data_count], - &label_blob_.cpu_data()[current_row_ * label_data_count], - sizeof(Dtype) * label_data_count, - cudaMemcpyHostToDevice)); - } - return Dtype(0.); -} - -INSTANTIATE_CLASS(HDF5DataLayer); - -} // namespace caffe From 37e3f6751735a7c8ef541f353cb67eb5d6556ac2 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 25 May 2014 22:43:13 +0800 Subject: [PATCH 15/75] Unify the CPU and the GPU Forward & Backward of the HDF5OutputDataLayer --- include/caffe/data_layers.hpp | 12 +++---- src/caffe/layers/hdf5_output_layer.cpp | 22 ++++++++---- src/caffe/layers/hdf5_output_layer.cu | 49 -------------------------- 3 files changed, 19 insertions(+), 64 deletions(-) delete mode 100644 src/caffe/layers/hdf5_output_layer.cu diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 843fdadefc2..b984e6140cd 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -31,6 +31,8 @@ class HDF5OutputLayer : public Layer { virtual ~HDF5OutputLayer(); virtual void SetUp(const vector*>& bottom, vector*>* top) {} + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_HDF5_OUTPUT; @@ -42,14 +44,8 @@ class HDF5OutputLayer : public Layer { inline std::string file_name() const { return file_name_; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); + virtual void Backward(const vector*>& top, + const bool propagate_down, vector*>* bottom) { return; } virtual void SaveBlobs(); std::string file_name_; diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index 3a513b9c366..519f541fb30 100644 --- 
a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -42,7 +42,15 @@ void HDF5OutputLayer::SaveBlobs() { } template -Dtype HDF5OutputLayer::Forward_cpu(const vector*>& bottom, +void HDF5OutputLayer::SetUp(const vector*>& bottom, + vector*>* top) { + // TODO: no limit on the number of blobs + CHECK_EQ(bottom.size(), 2) << "HDF5OutputLayer takes two blobs as input."; + CHECK_EQ(top->size(), 0) << "HDF5OutputLayer takes no output blobs."; +} + +template +Dtype HDF5OutputLayer::Forward(const vector*>& bottom, vector*>* top) { CHECK_GE(bottom.size(), 2); CHECK_EQ(bottom[0]->num(), bottom[1]->num()); @@ -54,12 +62,12 @@ Dtype HDF5OutputLayer::Forward_cpu(const vector*>& bottom, const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); for (int i = 0; i < bottom[0]->num(); ++i) { - memcpy(&data_blob_.mutable_cpu_data()[i * data_datum_dim], - &bottom[0]->cpu_data()[i * data_datum_dim], - sizeof(Dtype) * data_datum_dim); - memcpy(&label_blob_.mutable_cpu_data()[i * label_datum_dim], - &bottom[1]->cpu_data()[i * label_datum_dim], - sizeof(Dtype) * label_datum_dim); + this->device_->copy_from_cpu( + data_datum_dim, &bottom[0]->cpu_data()[i * data_datum_dim], + &data_blob_.mutable_cpu_data()[i * data_datum_dim]); + this->device_->copy_from_cpu( + label_datum_dim, &bottom[1]->cpu_data()[i * label_datum_dim], + &label_blob_.mutable_cpu_data()[i * label_datum_dim]); } SaveBlobs(); return Dtype(0.); diff --git a/src/caffe/layers/hdf5_output_layer.cu b/src/caffe/layers/hdf5_output_layer.cu deleted file mode 100644 index 59505ee6acf..00000000000 --- a/src/caffe/layers/hdf5_output_layer.cu +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include - -#include "hdf5.h" -#include "hdf5_hl.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { -using std::vector; - -template -Dtype HDF5OutputLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - CHECK_GE(bottom.size(), 2); - CHECK_EQ(bottom[0]->num(), bottom[1]->num()); - data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), - bottom[1]->height(), bottom[1]->width()); - const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); - const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); - - for (int i = 0; i < bottom[0]->num(); ++i) { - CUDA_CHECK(cudaMemcpy(&data_blob_.mutable_cpu_data()[i * data_datum_dim], - &bottom[0]->gpu_data()[i * data_datum_dim], - sizeof(Dtype) * data_datum_dim, cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(&label_blob_.mutable_cpu_data()[i * label_datum_dim], - &bottom[1]->gpu_data()[i * label_datum_dim], - sizeof(Dtype) * label_datum_dim, cudaMemcpyDeviceToHost)); - } - SaveBlobs(); - return Dtype(0.); -} - -template -void HDF5OutputLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - return; -} - -INSTANTIATE_CLASS(HDF5OutputLayer); - -} // namespace caffe From 881a728ea1428750f5944f6dfbe77b55d65cdb35 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 25 May 2014 22:46:58 +0800 Subject: [PATCH 16/75] Merge the CPU and the GPU Backward of the data layers --- include/caffe/data_layers.hpp | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 
b984e6140cd..5d0e5db2f4a 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -33,6 +33,8 @@ class HDF5OutputLayer : public Layer { vector*>* top) {} virtual Dtype Forward(const vector*>& bottom, vector*>* top); + virtual void Backward(const vector*>& top, + const bool propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_HDF5_OUTPUT; @@ -44,8 +46,6 @@ class HDF5OutputLayer : public Layer { inline std::string file_name() const { return file_name_; } protected: - virtual void Backward(const vector*>& top, - const bool propagate_down, vector*>* bottom) { return; } virtual void SaveBlobs(); std::string file_name_; @@ -65,6 +65,8 @@ class HDF5DataLayer : public Layer { vector*>* top); virtual Dtype Forward(const vector*>& bottom, vector*>* top); + virtual void Backward(const vector*>& top, + const bool propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_HDF5_DATA; @@ -73,10 +75,6 @@ class HDF5DataLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 2; } protected: - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} virtual void LoadHDF5FileData(const char* filename); std::vector hdf_filenames_; @@ -107,6 +105,8 @@ class DataLayer : public Layer { vector*>* top); virtual Dtype Forward(const vector*>& bottom, vector*>* top); + virtual void Backward(const vector*>& top, + const bool propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_DATA; @@ -116,11 +116,6 @@ class DataLayer : public Layer { virtual inline int MaxTopBlobs() const { return 2; } protected: - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void CreatePrefetchThread(); virtual void JoinPrefetchThread(); virtual unsigned int PrefetchRand(); @@ -192,6 +187,8 @@ class ImageDataLayer : public Layer { vector*>* top); virtual Dtype Forward(const vector*>& bottom, vector*>* top); + virtual void Backward(const vector*>& top, + const bool propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_IMAGE_DATA; @@ -200,11 +197,6 @@ class ImageDataLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 2; } protected: - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void ShuffleImages(); virtual void CreatePrefetchThread(); @@ -283,6 +275,8 @@ class WindowDataLayer : public Layer { virtual ~WindowDataLayer(); virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual void Backward(const vector*>& top, + const bool propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_WINDOW_DATA; @@ -295,10 +289,6 @@ class WindowDataLayer : public Layer { vector*>* top); virtual Dtype Forward_gpu(const vector*>& bottom, vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, 
vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} virtual void CreatePrefetchThread(); virtual void JoinPrefetchThread(); From dd703f97f4621f2cf3eb017e16e215bcdf8bec77 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 25 May 2014 22:51:14 +0800 Subject: [PATCH 17/75] Consolidate the CPU and GPU Forward of the WindowDataLayer --- include/caffe/data_layers.hpp | 7 ++-- src/caffe/layers/window_data_layer.cpp | 12 ++++--- src/caffe/layers/window_data_layer.cu | 44 -------------------------- 3 files changed, 9 insertions(+), 54 deletions(-) delete mode 100644 src/caffe/layers/window_data_layer.cu diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 5d0e5db2f4a..5151c278e11 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -275,6 +275,8 @@ class WindowDataLayer : public Layer { virtual ~WindowDataLayer(); virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); virtual void Backward(const vector*>& top, const bool propagate_down, vector*>* bottom) { return; } @@ -285,11 +287,6 @@ class WindowDataLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 2; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void CreatePrefetchThread(); virtual void JoinPrefetchThread(); virtual unsigned int PrefetchRand(); diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index fd4860f98be..449d3e58c3e 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -439,15 +439,17 @@ unsigned int WindowDataLayer::PrefetchRand() { } template -Dtype WindowDataLayer::Forward_cpu(const vector*>& bottom, +Dtype WindowDataLayer::Forward(const vector*>& bottom, vector*>* top) { // First, join the thread JoinPrefetchThread(); // Copy the data - caffe_copy(prefetch_data_->count(), prefetch_data_->cpu_data(), - (*top)[0]->mutable_cpu_data()); - caffe_copy(prefetch_label_->count(), prefetch_label_->cpu_data(), - (*top)[1]->mutable_cpu_data()); + this->device_->copy_from_cpu( + prefetch_data_->count(), prefetch_data_->cpu_data(), + (*top)[0]->mutable_cpu_data()); + this->device_->copy_from_cpu( + prefetch_label_->count(), prefetch_label_->cpu_data(), + (*top)[1]->mutable_cpu_data()); // Start a new prefetch thread CreatePrefetchThread(); return Dtype(0.); diff --git a/src/caffe/layers/window_data_layer.cu b/src/caffe/layers/window_data_layer.cu deleted file mode 100644 index bc49fef6545..00000000000 --- a/src/caffe/layers/window_data_layer.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2014 BVLC and contributors. -// -// Based on data_layer.cpp by Yangqing Jia. 
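// The data layers are pure sources: there are no bottom blobs to propagate
// gradients into, so the previously separate empty Backward_cpu/Backward_gpu
// stubs collapse into one device-neutral no-op override of the form:
//
//   virtual void Backward(const vector<Blob<Dtype>*>& top,
//       const bool propagate_down, vector<Blob<Dtype>*>* bottom) { return; }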
- -#include -#include - -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -using std::string; -using std::map; -using std::pair; - -// caffe.proto > LayerParameter > WindowDataParameter -// 'source' field specifies the window_file -// 'crop_size' indicates the desired warped size - -namespace caffe { - -template -Dtype WindowDataLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - // First, join the thread - JoinPrefetchThread(); - // Copy the data - CUDA_CHECK(cudaMemcpy((*top)[0]->mutable_gpu_data(), - prefetch_data_->cpu_data(), sizeof(Dtype) * prefetch_data_->count(), - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy((*top)[1]->mutable_gpu_data(), - prefetch_label_->cpu_data(), sizeof(Dtype) * prefetch_label_->count(), - cudaMemcpyHostToDevice)); - // Start a new prefetch thread - CreatePrefetchThread(); - return Dtype(0.); -} - -INSTANTIATE_CLASS(WindowDataLayer); - -} // namespace caffe From 8fc3e1d86b4ab02e9cc94c5a9e257a0c8e1b4cb1 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 25 May 2014 22:59:52 +0800 Subject: [PATCH 18/75] Deduplicate the CPU and the GPU Forward & Backward of the FlattenLayer --- include/caffe/vision_layers.hpp | 13 ++++--------- src/caffe/layers/flatten_layer.cpp | 4 ++-- src/caffe/layers/flatten_layer.cu | 26 -------------------------- 3 files changed, 6 insertions(+), 37 deletions(-) delete mode 100644 src/caffe/layers/flatten_layer.cu diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 9c0e2389ccb..4338d6c176d 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -166,6 +166,10 @@ class FlattenLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const bool propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_FLATTEN; @@ -174,15 +178,6 @@ class FlattenLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - int count_; }; diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp index 9494da9a255..d9799db026f 100644 --- a/src/caffe/layers/flatten_layer.cpp +++ b/src/caffe/layers/flatten_layer.cpp @@ -21,14 +21,14 @@ void FlattenLayer::SetUp(const vector*>& bottom, } template -Dtype FlattenLayer::Forward_cpu(const vector*>& bottom, +Dtype FlattenLayer::Forward(const vector*>& bottom, vector*>* top) { (*top)[0]->ShareData(*bottom[0]); return Dtype(0.); } template -void FlattenLayer::Backward_cpu(const vector*>& top, +void FlattenLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { (*bottom)[0]->ShareDiff(*top[0]); } diff --git a/src/caffe/layers/flatten_layer.cu b/src/caffe/layers/flatten_layer.cu deleted file mode 100644 index 68add383c48..00000000000 --- a/src/caffe/layers/flatten_layer.cu +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
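// FlattenLayer is a pure reshape, so the unified Forward/Backward above never
// copy: ShareData and ShareDiff make the top blob alias the bottom blob's
// storage, and only the recorded dimensions differ, which is why the same two
// lines serve CPU and GPU mode alike. A hypothetical aliasing check, not part
// of the patch series:
#include "caffe/blob.hpp"
#include "glog/logging.h"
namespace caffe {
template <typename Dtype>
void check_flatten_aliasing(const Blob<Dtype>& bottom,
                            const Blob<Dtype>& top) {
  // Same element count and the same underlying buffer; only the shape
  // bookkeeping (num/channels/height/width) differs.
  CHECK_EQ(bottom.count(), top.count());
  CHECK_EQ(bottom.cpu_data(), top.cpu_data());
}
}  // namespace caffe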
- -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype FlattenLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - (*top)[0]->ShareData(*bottom[0]); - return Dtype(0.); -} - -template -void FlattenLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - (*bottom)[0]->ShareDiff(*top[0]); -} - -INSTANTIATE_CLASS(FlattenLayer); - -} // namespace caffe From d80ba9f2739d2feb718bb9b25c9272a5e5385a8a Mon Sep 17 00:00:00 2001 From: Kai Li Date: Mon, 26 May 2014 08:44:03 +0800 Subject: [PATCH 19/75] Use the newly implemented caffe_gpu_{add,sub} in the GPU device wrapper --- src/caffe/util/gpu_device.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/caffe/util/gpu_device.cpp b/src/caffe/util/gpu_device.cpp index d19126ae665..c388f5f10b7 100644 --- a/src/caffe/util/gpu_device.cpp +++ b/src/caffe/util/gpu_device.cpp @@ -67,15 +67,13 @@ void GPUDevice::sqr(const int N, const Dtype* a, Dtype* y) { template void GPUDevice::add(const int N, const Dtype* a, const Dtype* b, Dtype* y) { - NOT_IMPLEMENTED; -// caffe_gpu_add(N, a, b, y); + caffe_gpu_add(N, a, b, y); } template void GPUDevice::sub(const int N, const Dtype* a, const Dtype* b, Dtype* y) { - NOT_IMPLEMENTED; -// caffe_gpu_sub(N, a, b, y); + caffe_gpu_sub(N, a, b, y); } template From 7bf9e67979ad4c3c2dc805fb2abdbb176abfc4a4 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Mon, 26 May 2014 08:48:15 +0800 Subject: [PATCH 20/75] Replace caffe_gpu_{copy+axpy} with sub in SigmoidCrossEntropyLossLayer --- src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu index 8f7275827e2..1a58f4a5320 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu @@ -50,8 +50,7 @@ void SigmoidCrossEntropyLossLayer::Backward_gpu( const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); const Dtype* target = (*bottom)[1]->gpu_data(); Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - caffe_gpu_copy(count, sigmoid_output_data, bottom_diff); - caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); + caffe_gpu_sub(count, sigmoid_output_data, bottom_diff); // Scale down gradient caffe_gpu_scal(count, Dtype(1) / num, bottom_diff); } From 3ff2afb2abe1a5e0df255567acccf0b1403d9ed9 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Mon, 26 May 2014 08:53:03 +0800 Subject: [PATCH 21/75] Unify the CPU/GPU Forward/Backward of the SigmoidCrossEntropyLossLayer --- include/caffe/loss_layers.hpp | 13 ++-- .../sigmoid_cross_entropy_loss_layer.cpp | 8 +-- .../sigmoid_cross_entropy_loss_layer.cu | 62 ------------------- 3 files changed, 8 insertions(+), 75 deletions(-) delete mode 100644 src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 3a4d41662fd..2d9cb39b3ab 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -103,21 +103,16 @@ class SigmoidCrossEntropyLossLayer : public LossLayer { sigmoid_output_(new Blob()) {} virtual void FurtherSetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const bool propagate_down, vector*>* bottom); virtual inline 
LayerParameter_LayerType type() const { return LayerParameter_LayerType_SIGMOID_CROSS_ENTROPY_LOSS; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - shared_ptr > sigmoid_layer_; // sigmoid_output stores the output of the sigmoid layer. shared_ptr > sigmoid_output_; diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index 8cb830ff248..df2db2cd2f3 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -25,7 +25,7 @@ void SigmoidCrossEntropyLossLayer::FurtherSetUp( } template -Dtype SigmoidCrossEntropyLossLayer::Forward_cpu( +Dtype SigmoidCrossEntropyLossLayer::Forward( const vector*>& bottom, vector*>* top) { // The forward pass computes the sigmoid outputs. sigmoid_bottom_vec_[0] = bottom[0]; @@ -48,7 +48,7 @@ Dtype SigmoidCrossEntropyLossLayer::Forward_cpu( } template -void SigmoidCrossEntropyLossLayer::Backward_cpu( +void SigmoidCrossEntropyLossLayer::Backward( const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[1]) { @@ -62,9 +62,9 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data(); const Dtype* target = (*bottom)[1]->cpu_data(); Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - caffe_sub(count, sigmoid_output_data, target, bottom_diff); + this->device_->sub(count, sigmoid_output_data, target, bottom_diff); // Scale down gradient - caffe_scal(count, Dtype(1) / num, bottom_diff); + this->device_->scal(count, Dtype(1) / num, bottom_diff); } } diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu deleted file mode 100644 index 1a58f4a5320..00000000000 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -using std::max; - -namespace caffe { - -template -Dtype SigmoidCrossEntropyLossLayer::Forward_gpu( - const vector*>& bottom, vector*>* top) { - // The forward pass computes the sigmoid outputs. 
- sigmoid_bottom_vec_[0] = bottom[0]; - sigmoid_layer_->Forward(sigmoid_bottom_vec_, &sigmoid_top_vec_); - // Compute the loss (negative log likelihood) - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); - // Stable version of loss computation from input data - const Dtype* input_data = bottom[0]->cpu_data(); - const Dtype* target = bottom[1]->cpu_data(); - Dtype loss = 0; - for (int i = 0; i < count; ++i) { - loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) - - log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); - } - if (top->size() == 1) { - (*top)[0]->mutable_cpu_data()[0] = loss / num; - } - return loss / num; -} - -template -void SigmoidCrossEntropyLossLayer::Backward_gpu( - const vector*>& top, const vector& propagate_down, - vector*>* bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type_name() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - // First, compute the diff - const int count = (*bottom)[0]->count(); - const int num = (*bottom)[0]->num(); - const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); - const Dtype* target = (*bottom)[1]->gpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - caffe_gpu_sub(count, sigmoid_output_data, bottom_diff); - // Scale down gradient - caffe_gpu_scal(count, Dtype(1) / num, bottom_diff); - } -} - -INSTANTIATE_CLASS(SigmoidCrossEntropyLossLayer); - - -} // namespace caffe From 071a5d04c39392523d232b77c3e366350f2f11f9 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Mon, 26 May 2014 09:00:49 +0800 Subject: [PATCH 22/75] Merge the CPU/GPU Forward/Backward of the SoftmaxWithLossLayer --- src/caffe/layers/softmax_loss_layer.cpp | 8 +++---- src/caffe/layers/softmax_loss_layer.cu | 32 ------------------------- 2 files changed, 4 insertions(+), 36 deletions(-) delete mode 100644 src/caffe/layers/softmax_loss_layer.cu diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index 1a3601aa9e6..a884505d533 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -32,13 +32,13 @@ void SoftmaxWithLossLayer::SetUp(const vector*>& bottom, } template -Dtype SoftmaxWithLossLayer::Forward_cpu( +Dtype SoftmaxWithLossLayer::Forward( const vector*>& bottom, vector*>* top) { // The forward pass computes the softmax prob values. softmax_bottom_vec_[0] = bottom[0]; softmax_layer_->Forward(softmax_bottom_vec_, &softmax_top_vec_); - const Dtype* prob_data = prob_.cpu_data(); - const Dtype* label = bottom[1]->cpu_data(); + const Dtype* prob_data = prob_.const_data(); + const Dtype* label = bottom[1]->const_data(); int num = prob_.num(); int dim = prob_.count() / num; Dtype loss = 0; @@ -74,7 +74,7 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, bottom_diff[i * dim + static_cast(label[i])] -= 1; } // Scale down gradient - caffe_scal(prob_.count(), Dtype(1) / num, bottom_diff); + this->device_scal(prob_.count(), Dtype(1) / num, bottom_diff); } } diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu deleted file mode 100644 index e46be6ba85d..00000000000 --- a/src/caffe/layers/softmax_loss_layer.cu +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
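
For reference, the loss that the merged SoftmaxWithLossLayer::Forward above accumulates is the mean negative log-likelihood of the ground-truth class, L = -(1/N) * sum_i log(p_i[label_i]), where p_i is the softmax of row i. A small standalone sketch of the same reduction, assuming prob already holds the softmax probabilities laid out as num rows of dim entries (the clamp guards against log(0)):

    #include <algorithm>
    #include <cmath>

    // prob: num x dim softmax probabilities, label: num ground-truth indices
    float softmax_loss(const float* prob, const float* label, int num, int dim) {
      float loss = 0.0f;
      for (int i = 0; i < num; ++i) {
        const int gt = static_cast<int>(label[i]);
        const float p = std::max(prob[i * dim + gt], 1e-37f);  // avoid log(0)
        loss -= std::log(p);
      }
      return loss / num;
    }
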
- -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -using std::max; - -namespace caffe { - -template -Dtype SoftmaxWithLossLayer::Forward_gpu( - const vector*>& bottom, vector*>* top) { - // The forward pass computes the softmax prob values. - return Forward_cpu(bottom, top); -} - -template -void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - // TODO(Yangqing): implement the GPU version of softmax. - Backward_cpu(top, propagate_down, bottom); -} - -INSTANTIATE_CLASS(SoftmaxWithLossLayer); - - -} // namespace caffe From 26f7b34066cea71ba4bd3f0108962680a66065e8 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Mon, 26 May 2014 09:08:25 +0800 Subject: [PATCH 23/75] Use {const, mutable}_{data, diff} in the unified Forward/Backward --- src/caffe/layers/data_layer.cpp | 22 +++++++++---------- src/caffe/layers/hdf5_data_layer.cpp | 8 +++---- src/caffe/layers/hdf5_output_layer.cpp | 8 +++---- src/caffe/layers/image_data_layer.cpp | 22 +++++++++---------- .../sigmoid_cross_entropy_loss_layer.cpp | 6 ++--- src/caffe/layers/window_data_layer.cpp | 22 +++++++++---------- 6 files changed, 44 insertions(+), 44 deletions(-) diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 06be98cf3f4..8c785765e96 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -25,10 +25,10 @@ void* DataLayerPrefetch(void* layer_pointer) { CHECK(layer); Datum datum; CHECK(layer->prefetch_data_); - Dtype* top_data = layer->prefetch_data_->mutable_cpu_data(); + Dtype* top_data = layer->prefetch_data_->mutable_data(); Dtype* top_label = NULL; // suppress warnings about uninitialized variables if (layer->output_labels_) { - top_label = layer->prefetch_label_->mutable_cpu_data(); + top_label = layer->prefetch_label_->mutable_data(); } const Dtype scale = layer->layer_param_.data_param().scale(); const int batch_size = layer->layer_param_.data_param().batch_size(); @@ -44,7 +44,7 @@ void* DataLayerPrefetch(void* layer_pointer) { const int height = layer->datum_height_; const int width = layer->datum_width_; const int size = layer->datum_size_; - const Dtype* mean = layer->data_mean_.cpu_data(); + const Dtype* mean = layer->data_mean_.const_data(); for (int item_id = 0; item_id < batch_size; ++item_id) { // get a blob switch (layer->layer_param_.data_param().backend()) { @@ -302,14 +302,14 @@ void DataLayer::SetUp(const vector*>& bottom, data_mean_.Reshape(1, datum_channels_, datum_height_, datum_width_); } // Now, start the prefetch thread. Before calling prefetch, we make two - // cpu_data calls so that the prefetch thread does not accidentally make + // const_data calls so that the prefetch thread does not accidentally make // simultaneous cudaMalloc calls when the main thread is running. In some // GPUs this seems to cause failures if we do not so. 
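
The comment above explains why the touch calls that follow access the prefetch buffers once on the main thread before the worker starts: the first access triggers allocation, so the prefetch thread never races the main thread into simultaneous cudaMalloc calls. The same pattern in miniature, as a hedged sketch with a placeholder Buffer type rather than the actual Blob/SyncedMemory classes:

    #include <thread>
    #include <vector>

    struct Buffer {
      std::vector<float> data_;
      float* mutable_data() {                   // lazy allocation on first access
        if (data_.empty()) data_.resize(1024);
        return data_.data();
      }
    };

    int main() {
      Buffer prefetch_data, prefetch_label;
      // Touch the buffers on the main thread so allocation happens here ...
      prefetch_data.mutable_data();
      prefetch_label.mutable_data();
      // ... and the prefetch worker only writes into already-allocated memory.
      std::thread worker([&]() {
        prefetch_data.mutable_data()[0] = 0.f;   // fill a batch here
        prefetch_label.mutable_data()[0] = 0.f;
      });
      worker.join();
      return 0;
    }
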
- prefetch_data_->mutable_cpu_data(); + prefetch_data_->mutable_data(); if (output_labels_) { - prefetch_label_->mutable_cpu_data(); + prefetch_label_->mutable_data(); } - data_mean_.cpu_data(); + data_mean_.const_data(); DLOG(INFO) << "Initializing prefetch"; CreatePrefetchThread(); DLOG(INFO) << "Prefetch initialized."; @@ -352,12 +352,12 @@ Dtype DataLayer::Forward(const vector*>& bottom, JoinPrefetchThread(); // Copy the data this->device_->copy_from_cpu( - prefetch_data_->count(), prefetch_data_->cpu_data(), - (*top)[0]->mutable_cpu_data()); + prefetch_data_->count(), prefetch_data_->const_data(), + (*top)[0]->mutable_data()); if (output_labels_) { this->device_->copy_from_cpu( - prefetch_label_->count(), prefetch_label_->cpu_data(), - (*top)[1]->mutable_cpu_data()); + prefetch_label_->count(), prefetch_label_->const_data(), + (*top)[1]->mutable_data()); } // Start a new prefetch thread CreatePrefetchThread(); diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 1be99ae5dbb..cf8dff3507a 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -105,12 +105,12 @@ Dtype HDF5DataLayer::Forward(const vector*>& bottom, current_row_ = 0; } this->device_->copy_from_cpu( - data_count, &data_blob_.cpu_data()[current_row_ * data_count], - &(*top)[0]->mutable_cpu_data()[i * data_count]); + data_count, &data_blob_.const_data()[current_row_ * data_count], + &(*top)[0]->mutable_data()[i * data_count]); this->device_->copy_from_cpu( label_data_count, - &label_blob_.cpu_data()[current_row_ * label_data_count], - &(*top)[1]->mutable_cpu_data()[i * label_data_count]); + &label_blob_.const_data()[current_row_ * label_data_count], + &(*top)[1]->mutable_data()[i * label_data_count]); } return Dtype(0.); } diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index 519f541fb30..2093b42d760 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -63,11 +63,11 @@ Dtype HDF5OutputLayer::Forward(const vector*>& bottom, for (int i = 0; i < bottom[0]->num(); ++i) { this->device_->copy_from_cpu( - data_datum_dim, &bottom[0]->cpu_data()[i * data_datum_dim], - &data_blob_.mutable_cpu_data()[i * data_datum_dim]); + data_datum_dim, &bottom[0]->const_data()[i * data_datum_dim], + &data_blob_.mutable_data()[i * data_datum_dim]); this->device_->copy_from_cpu( - label_datum_dim, &bottom[1]->cpu_data()[i * label_datum_dim], - &label_blob_.mutable_cpu_data()[i * label_datum_dim]); + label_datum_dim, &bottom[1]->const_data()[i * label_datum_dim], + &label_blob_.mutable_data()[i * label_datum_dim]); } SaveBlobs(); return Dtype(0.); diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index da97a4aa9cb..774add7155c 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -30,8 +30,8 @@ void* ImageDataLayerPrefetch(void* layer_pointer) { CHECK(layer); Datum datum; CHECK(layer->prefetch_data_); - Dtype* top_data = layer->prefetch_data_->mutable_cpu_data(); - Dtype* top_label = layer->prefetch_label_->mutable_cpu_data(); + Dtype* top_data = layer->prefetch_data_->mutable_data(); + Dtype* top_label = layer->prefetch_label_->mutable_data(); ImageDataParameter image_data_param = layer->layer_param_.image_data_param(); const Dtype scale = image_data_param.scale(); const int batch_size = image_data_param.batch_size(); @@ -50,7 +50,7 @@ void* ImageDataLayerPrefetch(void* layer_pointer) { const int 
width = layer->datum_width_; const int size = layer->datum_size_; const int lines_size = layer->lines_.size(); - const Dtype* mean = layer->data_mean_.cpu_data(); + const Dtype* mean = layer->data_mean_.const_data(); for (int item_id = 0; item_id < batch_size; ++item_id) { // get a blob CHECK_GT(lines_size, layer->lines_id_); @@ -220,12 +220,12 @@ void ImageDataLayer::SetUp(const vector*>& bottom, data_mean_.Reshape(1, datum_channels_, datum_height_, datum_width_); } // Now, start the prefetch thread. Before calling prefetch, we make two - // cpu_data calls so that the prefetch thread does not accidentally make + // const_data calls so that the prefetch thread does not accidentally make // simultaneous cudaMalloc calls when the main thread is running. In some // GPUs this seems to cause failures if we do not so. - prefetch_data_->mutable_cpu_data(); - prefetch_label_->mutable_cpu_data(); - data_mean_.cpu_data(); + prefetch_data_->mutable_data(); + prefetch_label_->mutable_data(); + data_mean_.const_data(); DLOG(INFO) << "Initializing prefetch"; CreatePrefetchThread(); DLOG(INFO) << "Prefetch initialized."; @@ -279,11 +279,11 @@ Dtype ImageDataLayer::Forward(const vector*>& bottom, JoinPrefetchThread(); // Copy the data this->device_->copy_from_cpu( - prefetch_data_->count(), prefetch_data_->cpu_data(), - (*top)[0]->mutable_cpu_data()); + prefetch_data_->count(), prefetch_data_->const_data(), + (*top)[0]->mutable_data()); this->device_->copy_from_cpu( - prefetch_label_->count(), prefetch_label_->cpu_data(), - (*top)[1]->mutable_cpu_data()); + prefetch_label_->count(), prefetch_label_->const_data(), + (*top)[1]->mutable_data()); // Start a new prefetch thread CreatePrefetchThread(); return Dtype(0.); diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index df2db2cd2f3..627598bde12 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -34,8 +34,8 @@ Dtype SigmoidCrossEntropyLossLayer::Forward( const int count = bottom[0]->count(); const int num = bottom[0]->num(); // Stable version of loss computation from input data - const Dtype* input_data = bottom[0]->cpu_data(); - const Dtype* target = bottom[1]->cpu_data(); + const Dtype* input_data = bottom[0]->const_data(); + const Dtype* target = bottom[1]->const_data(); Dtype loss = 0; for (int i = 0; i < count; ++i) { loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) - @@ -60,7 +60,7 @@ void SigmoidCrossEntropyLossLayer::Backward( const int count = (*bottom)[0]->count(); const int num = (*bottom)[0]->num(); const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data(); - const Dtype* target = (*bottom)[1]->cpu_data(); + const Dtype* target = (*bottom)[1]->const_data(); Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); this->device_->sub(count, sigmoid_output_data, target, bottom_diff); // Scale down gradient diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index 449d3e58c3e..5817ab532bd 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -40,8 +40,8 @@ void* WindowDataLayerPrefetch(void* layer_pointer) { // At each iteration, sample N windows where N*p are foreground (object) // windows and N*(1-p) are background (non-object) windows - Dtype* top_data = layer->prefetch_data_->mutable_cpu_data(); - Dtype* top_label = layer->prefetch_label_->mutable_cpu_data(); + Dtype* top_data = 
layer->prefetch_data_->mutable_data(); + Dtype* top_label = layer->prefetch_label_->mutable_data(); const Dtype scale = layer->layer_param_.window_data_param().scale(); const int batch_size = layer->layer_param_.window_data_param().batch_size(); const int crop_size = layer->layer_param_.window_data_param().crop_size(); @@ -49,7 +49,7 @@ void* WindowDataLayerPrefetch(void* layer_pointer) { const bool mirror = layer->layer_param_.window_data_param().mirror(); const float fg_fraction = layer->layer_param_.window_data_param().fg_fraction(); - const Dtype* mean = layer->data_mean_.cpu_data(); + const Dtype* mean = layer->data_mean_.const_data(); const int mean_off = (layer->data_mean_.width() - crop_size) / 2; const int mean_width = layer->data_mean_.width(); const int mean_height = layer->data_mean_.height(); @@ -398,12 +398,12 @@ void WindowDataLayer::SetUp(const vector*>& bottom, data_mean_.Reshape(1, channels, crop_size, crop_size); } // Now, start the prefetch thread. Before calling prefetch, we make two - // cpu_data calls so that the prefetch thread does not accidentally make + // const_data calls so that the prefetch thread does not accidentally make // simultaneous cudaMalloc calls when the main thread is running. In some // GPUs this seems to cause failures if we do not so. - prefetch_data_->mutable_cpu_data(); - prefetch_label_->mutable_cpu_data(); - data_mean_.cpu_data(); + prefetch_data_->mutable_data(); + prefetch_label_->mutable_data(); + data_mean_.const_data(); DLOG(INFO) << "Initializing prefetch"; CreatePrefetchThread(); DLOG(INFO) << "Prefetch initialized."; @@ -445,11 +445,11 @@ Dtype WindowDataLayer::Forward(const vector*>& bottom, JoinPrefetchThread(); // Copy the data this->device_->copy_from_cpu( - prefetch_data_->count(), prefetch_data_->cpu_data(), - (*top)[0]->mutable_cpu_data()); + prefetch_data_->count(), prefetch_data_->const_data(), + (*top)[0]->mutable_data()); this->device_->copy_from_cpu( - prefetch_label_->count(), prefetch_label_->cpu_data(), - (*top)[1]->mutable_cpu_data()); + prefetch_label_->count(), prefetch_label_->const_data(), + (*top)[1]->mutable_data()); // Start a new prefetch thread CreatePrefetchThread(); return Dtype(0.); From fc675cab59b6dd2e668d8d98ed64f02f980dfeaa Mon Sep 17 00:00:00 2001 From: Kai Li Date: Mon, 26 May 2014 09:16:02 +0800 Subject: [PATCH 24/75] Unify the CPU/GPU Forward/Backward of the InnerProductLayer --- include/caffe/vision_layers.hpp | 13 ++---- src/caffe/layers/inner_product_layer.cpp | 32 ++++++------- src/caffe/layers/inner_product_layer.cu | 57 ------------------------ 3 files changed, 20 insertions(+), 82 deletions(-) delete mode 100644 src/caffe/layers/inner_product_layer.cu diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 4338d6c176d..90dd8ec5df2 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -224,6 +224,10 @@ class InnerProductLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const bool propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_INNER_PRODUCT; @@ -232,15 +236,6 @@ class InnerProductLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - 
vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - int M_; int K_; int N_; diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index ddf55e49b63..28dc4468031 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -57,41 +57,41 @@ void InnerProductLayer::SetUp(const vector*>& bottom, } template -Dtype InnerProductLayer::Forward_cpu(const vector*>& bottom, +Dtype InnerProductLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - const Dtype* weight = this->blobs_[0]->cpu_data(); - caffe_cpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); + const Dtype* weight = this->blobs_[0]->const_data(); + this->device_->gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., bottom_data, weight, (Dtype)0., top_data); if (bias_term_) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., + this->device_->gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., reinterpret_cast(bias_multiplier_->cpu_data()), - this->blobs_[1]->cpu_data(), (Dtype)1., top_data); + this->blobs_[1]->const_data(), (Dtype)1., top_data); } return Dtype(0); } template -void InnerProductLayer::Backward_cpu(const vector*>& top, +void InnerProductLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); + const Dtype* top_diff = top[0]->const_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); // Gradient with respect to weight - caffe_cpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_cpu_diff()); + this->device_->gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., + top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_diff()); if (bias_term_) { // Gradient with respect to bias - caffe_cpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, + this->device_->gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, reinterpret_cast(bias_multiplier_->cpu_data()), (Dtype)0., this->blobs_[1]->mutable_cpu_diff()); } if (propagate_down[0]) { // Gradient with respect to bottom data - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, this->blobs_[0]->cpu_data(), (Dtype)0., - (*bottom)[0]->mutable_cpu_diff()); + this->device_->gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., + top_diff, this->blobs_[0]->const_data(), (Dtype)0., + (*bottom)[0]->mutable_diff()); } } diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu deleted file mode 100644 index 5b95a57b23b..00000000000 --- a/src/caffe/layers/inner_product_layer.cu +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
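
The unified InnerProductLayer::Forward above is a single device-agnostic GEMM (the GPU file deleted below did the same through caffe_gpu_gemm): with M_ examples, K_ inputs and N_ outputs, top = bottom * weight^T, plus an optional bias. A naive standalone sketch of the same product, useful for checking the M_/K_/N_ and transpose conventions; plain loops, not a BLAS call:

    // top[m][n] = bias[n] + sum_k bottom[m][k] * weight[n][k]
    // bottom: M x K (row-major), weight: N x K (row-major), top: M x N
    void inner_product_forward(const float* bottom, const float* weight,
                               const float* bias,  // N entries, may be nullptr
                               int M, int K, int N, float* top) {
      for (int m = 0; m < M; ++m) {
        for (int n = 0; n < N; ++n) {
          float acc = bias ? bias[n] : 0.0f;
          for (int k = 0; k < K; ++k) {
            acc += bottom[m * K + k] * weight[n * K + k];
          }
          top[m * N + n] = acc;
        }
      }
    }
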
- -#include - -#include - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype InnerProductLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., - bottom_data, weight, (Dtype)0., top_data); - if (bias_term_) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., - reinterpret_cast(bias_multiplier_->gpu_data()), - this->blobs_[1]->gpu_data(), (Dtype)1., top_data); - } - return Dtype(0); -} - -template -void InnerProductLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = (*bottom)[0]->gpu_data(); - // Gradient with respect to weight - caffe_gpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_gpu_diff()); - if (bias_term_) { - // Gradient with respect to bias - caffe_gpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - reinterpret_cast(bias_multiplier_->gpu_data()), - (Dtype)0., this->blobs_[1]->mutable_gpu_diff()); - } - if (propagate_down[0]) { - // Gradient with respect to bottom data - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, this->blobs_[0]->gpu_data(), (Dtype)0., - (*bottom)[0]->mutable_gpu_diff()); - } -} - -INSTANTIATE_CLASS(InnerProductLayer); - -} // namespace caffe From 7a3faf00dba10e92e93ffbb6a4075c6f0ef37cef Mon Sep 17 00:00:00 2001 From: Kai Li Date: Mon, 26 May 2014 09:20:04 +0800 Subject: [PATCH 25/75] Unify the CPU/GPU Forward/Backward of the SplitLayer --- include/caffe/vision_layers.hpp | 18 +++++++----------- src/caffe/layers/split_layer.cpp | 14 ++++++++++---- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 90dd8ec5df2..1479f52e1fd 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -400,23 +400,19 @@ class SplitLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); - - virtual inline LayerParameter_LayerType type() const { - return LayerParameter_LayerType_SPLIT; - } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, + virtual Dtype Forward(const vector*>& bottom, vector*>* top); +<<<<<<< HEAD virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom); virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, vector*>* bottom); +======= + virtual void Backward(const vector*>& top, + const bool propagate_down, vector*>* bottom); +>>>>>>> Unify the CPU/GPU Forward/Backward of the SplitLayer + protected: int count_; }; diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 28abd95f5ff..aaa624bb7d3 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -27,7 +27,7 @@ void SplitLayer::SetUp(const vector*>& bottom, } template -Dtype SplitLayer::Forward_cpu(const 
vector*>& bottom, +Dtype SplitLayer::Forward(const vector*>& bottom, vector*>* top) { for (int i = 0; i < top->size(); ++i) { (*top)[i]->ShareData(*bottom[0]); @@ -36,15 +36,21 @@ Dtype SplitLayer::Forward_cpu(const vector*>& bottom, } template +<<<<<<< HEAD void SplitLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { +======= +void SplitLayer::Backward(const vector*>& top, + const bool propagate_down, vector*>* bottom) { + if (propagate_down) { +>>>>>>> Unify the CPU/GPU Forward/Backward of the SplitLayer (*bottom)[0]->ShareDiff(*top[0]); // Add remaining top blob diffs. - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); for (int i = 1; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->cpu_diff(); - caffe_axpy(count_, Dtype(1.), top_diff, bottom_diff); + const Dtype* top_diff = top[i]->const_diff(); + this->device_->axpy(count_, Dtype(1.), top_diff, bottom_diff); } } } From 9d9b1c3d339fcab461a457680c07288aa7ebc964 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Mon, 26 May 2014 09:27:40 +0800 Subject: [PATCH 26/75] Unify the CPU/GPU Forward/Backward of the EltwiseLayer --- include/caffe/vision_layers.hpp | 20 ++------- src/caffe/layers/eltwise_layer.cpp | 32 +++++++------- src/caffe/layers/eltwise_layer.cu | 69 ------------------------------ 3 files changed, 21 insertions(+), 100 deletions(-) delete mode 100644 src/caffe/layers/eltwise_layer.cu diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 1479f52e1fd..17e4fa4b3c3 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -136,6 +136,10 @@ class EltwiseLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const bool propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_ELTWISE; @@ -144,15 +148,6 @@ class EltwiseLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - EltwiseParameter_EltwiseOp op_; vector coeffs_; }; @@ -402,15 +397,8 @@ class SplitLayer : public Layer { vector*>* top); virtual Dtype Forward(const vector*>& bottom, vector*>* top); -<<<<<<< HEAD - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); -======= virtual void Backward(const vector*>& top, const bool propagate_down, vector*>* bottom); ->>>>>>> Unify the CPU/GPU Forward/Backward of the SplitLayer protected: int count_; diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index 2c265f6678f..7edfe3720c7 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -41,22 +41,24 @@ void EltwiseLayer::SetUp(const vector*>& bottom, } template -Dtype EltwiseLayer::Forward_cpu( +Dtype EltwiseLayer::Forward( const vector*>& bottom, vector*>* top) { const int count = (*top)[0]->count(); 
- Dtype* top_data = (*top)[0]->mutable_cpu_data(); + Dtype* top_data = (*top)[0]->mutable_data(); switch (op_) { case EltwiseParameter_EltwiseOp_PROD: - caffe_mul(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), top_data); + this->device_->mul(count, bottom[0]->const_data(), + bottom[1]->const_data(), top_data); for (int i = 2; i < bottom.size(); ++i) { - caffe_mul(count, top_data, bottom[i]->cpu_data(), top_data); + this->device_->mul(count, top_data, bottom[i]->const_data(), top_data); } break; case EltwiseParameter_EltwiseOp_SUM: - caffe_set(count, Dtype(0), top_data); + this->device_->set(count, Dtype(0), top_data); // TODO(shelhamer) does BLAS optimize to sum for coeff = 1? for (int i = 0; i < bottom.size(); ++i) { - caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data); + this->device_->axpy(count, coeffs_[i], bottom[i]->const_data(), + top_data); } break; default: @@ -66,25 +68,25 @@ Dtype EltwiseLayer::Forward_cpu( } template -void EltwiseLayer::Backward_cpu(const vector*>& top, +void EltwiseLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { const int count = top[0]->count(); - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_data = top[0]->const_data(); + const Dtype* top_diff = top[0]->const_diff(); for (int i = 0; i < bottom->size(); ++i) { if (propagate_down[i]) { - const Dtype* bottom_data = (*bottom)[i]->cpu_data(); - Dtype* bottom_diff = (*bottom)[i]->mutable_cpu_diff(); + const Dtype* bottom_data = (*bottom)[i]->const_data(); + Dtype* bottom_diff = (*bottom)[i]->mutable_diff(); switch (op_) { case EltwiseParameter_EltwiseOp_PROD: - caffe_div(count, top_data, bottom_data, bottom_diff); - caffe_mul(count, bottom_diff, top_diff, bottom_diff); + this->device_->div(count, top_data, bottom_data, bottom_diff); + this->device_->mul(count, bottom_diff, top_diff, bottom_diff); break; case EltwiseParameter_EltwiseOp_SUM: if (coeffs_[i] == Dtype(1)) { - caffe_copy(count, top_diff, bottom_diff); + this->device_->copy(count, top_diff, bottom_diff); } else { - caffe_cpu_scale(count, coeffs_[i], top_diff, bottom_diff); + this->device_->scale(count, coeffs_[i], top_diff, bottom_diff); } break; default: diff --git a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/eltwise_layer.cu deleted file mode 100644 index 3860944889c..00000000000 --- a/src/caffe/layers/eltwise_layer.cu +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype EltwiseLayer::Forward_gpu( - const vector*>& bottom, vector*>* top) { - const int count = (*top)[0]->count(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - caffe_gpu_mul(count, bottom[0]->gpu_data(), - bottom[1]->gpu_data(), top_data); - for (int i = 2; i < bottom.size(); ++i) { - caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_SUM: - caffe_gpu_set(count, Dtype(0.), top_data); - // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? 
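
In both the unified Forward above and the GPU version being deleted here, the SUM branch zeroes the output and then accumulates each coefficient-weighted bottom blob with axpy, i.e. top[j] = sum_i coeff_i * bottom_i[j]. A standalone sketch of that accumulation in scalar form:

    #include <vector>

    // Element-wise weighted sum: top[j] = sum_i coeffs[i] * bottoms[i][j],
    // the scalar form of the set + repeated-axpy accumulation in the SUM branch.
    void eltwise_sum(const std::vector<const float*>& bottoms,
                     const std::vector<float>& coeffs,
                     int count, float* top) {
      for (int j = 0; j < count; ++j) top[j] = 0.0f;           // set(count, 0)
      for (size_t i = 0; i < bottoms.size(); ++i) {
        for (int j = 0; j < count; ++j) {
          top[j] += coeffs[i] * bottoms[i][j];                  // axpy per blob
        }
      }
    }
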
- for (int i = 0; i < bottom.size(); ++i) { - caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } - return Dtype(0.); -} - -template -void EltwiseLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - const int count = top[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - for (int i = 0; i < bottom->size(); ++i) { - if (propagate_down[i]) { - const Dtype* bottom_data = (*bottom)[i]->gpu_data(); - Dtype* bottom_diff = (*bottom)[i]->mutable_gpu_diff(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); - break; - case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1.)) { - caffe_gpu_copy(count, top_diff, bottom_diff); - } else { - caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } - } - } -} - -INSTANTIATE_CLASS(EltwiseLayer); - - -} // namespace caffe From 5d3be7a0d30c0291a0a5d4f663c15ada2e0e8057 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Mon, 26 May 2014 09:37:18 +0800 Subject: [PATCH 27/75] Add im2col and col2im to wrap im2col_{cpu, gpu} and col2im_{cpu, gpu} --- include/caffe/util/im2col.hpp | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index a649d8cc4e8..e14018fd356 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -3,6 +3,8 @@ #ifndef _CAFFE_UTIL_IM2COL_HPP_ #define _CAFFE_UTIL_IM2COL_HPP_ +#include "caffe/common.hpp" + namespace caffe { template @@ -25,6 +27,38 @@ void col2im_gpu(const Dtype* data_col, const int channels, const int height, const int width, const int psize, const int pad, const int stride, Dtype* data_im); +template +inline void im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col) { + switch (Caffe::mode()) { + case Caffe::CPU: + im2col_cpu(data_im, channels, height, width, ksize, pad, stride, + data_col); + case Caffe::GPU: + im2col_gpu(data_im, channels, height, width, ksize, pad, stride, + data_col); + default: + LOG(FATAL) << "Unknown caffe mode."; + } +} + +template +inline void col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im) { + switch (Caffe::mode()) { + case Caffe::CPU: + im2col_cpu(data_col, channels, height, width, psize, pad, stride, + data_im); + case Caffe::GPU: + im2col_gpu(data_col, channels, height, width, psize, pad, stride, + data_im); + default: + LOG(FATAL) << "Unknown caffe mode."; + } +} + } // namespace caffe #endif // CAFFE_UTIL_IM2COL_HPP_ From b2a90d96cfd4c968fe62bc200b673c2f7a38d8b8 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Mon, 26 May 2014 09:43:11 +0800 Subject: [PATCH 28/75] Unify the CPU/GPU versions of Forward/Backward of the ConvolutionLayer --- include/caffe/vision_layers.hpp | 13 ++-- src/caffe/layers/conv_layer.cpp | 48 +++++++-------- src/caffe/layers/conv_layer.cu | 104 -------------------------------- 3 files changed, 28 insertions(+), 137 deletions(-) delete mode 100644 src/caffe/layers/conv_layer.cu diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 
17e4fa4b3c3..2c8397a0d28 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -92,6 +92,10 @@ class ConvolutionLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const bool propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_CONVOLUTION; @@ -100,15 +104,6 @@ class ConvolutionLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - int kernel_size_; int stride_; int num_; diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 9ec8da47e1a..891097b239f 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -77,29 +77,29 @@ void ConvolutionLayer::SetUp(const vector*>& bottom, template -Dtype ConvolutionLayer::Forward_cpu(const vector*>& bottom, +Dtype ConvolutionLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - Dtype* col_data = col_buffer_.mutable_cpu_data(); - const Dtype* weight = this->blobs_[0]->cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); + Dtype* col_data = col_buffer_.mutable_data(); + const Dtype* weight = this->blobs_[0]->const_data(); int weight_offset = M_ * K_; int col_offset = K_ * N_; int top_offset = M_ * N_; for (int n = 0; n < num_; ++n) { // First, im2col - im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, + im2col(bottom_data + bottom[0]->offset(n), channels_, height_, width_, kernel_size_, pad_, stride_, col_data); // Second, innerproduct with groups for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, K_, + this->device_->gemm(CblasNoTrans, CblasNoTrans, M_, N_, K_, (Dtype)1., weight + weight_offset * g, col_data + col_offset * g, (Dtype)0., top_data + (*top)[0]->offset(n) + top_offset * g); } // third, add bias if (bias_term_) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - N_, 1, (Dtype)1., this->blobs_[1]->cpu_data(), + this->device_->gemm(CblasNoTrans, CblasNoTrans, num_output_, + N_, 1, (Dtype)1., this->blobs_[1]->const_data(), reinterpret_cast(bias_multiplier_->cpu_data()), (Dtype)1., top_data + (*top)[0]->offset(n)); } @@ -108,23 +108,23 @@ Dtype ConvolutionLayer::Forward_cpu(const vector*>& bottom, } template -void ConvolutionLayer::Backward_cpu(const vector*>& top, +void ConvolutionLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* weight = this->blobs_[0]->cpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - Dtype* col_data = col_buffer_.mutable_cpu_data(); - Dtype* col_diff = col_buffer_.mutable_cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); + const Dtype* weight = 
this->blobs_[0]->const_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + Dtype* col_data = col_buffer_.mutable_data(); + Dtype* col_diff = col_buffer_.mutable_diff(); // bias gradient if necessary Dtype* bias_diff = NULL; if (bias_term_) { - bias_diff = this->blobs_[1]->mutable_cpu_diff(); - memset(bias_diff, 0, sizeof(Dtype) * this->blobs_[1]->count()); + bias_diff = this->blobs_[1]->mutable_diff(); + this->device_->set(this->blobs_[1]->count(), 0, bias_diff); for (int n = 0; n < num_; ++n) { - caffe_cpu_gemv(CblasNoTrans, num_output_, N_, + this->device_->gemv(CblasNoTrans, num_output_, N_, 1., top_diff + top[0]->offset(n), reinterpret_cast(bias_multiplier_->cpu_data()), 1., bias_diff); @@ -134,15 +134,15 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, int weight_offset = M_ * K_; int col_offset = K_ * N_; int top_offset = M_ * N_; - memset(weight_diff, 0, sizeof(Dtype) * this->blobs_[0]->count()); + this->device_->set(this->blobs_[0]->count(), 0, weight_diff); for (int n = 0; n < num_; ++n) { // since we saved memory in the forward pass by not storing all col data, // we will need to recompute them. - im2col_cpu(bottom_data + (*bottom)[0]->offset(n), channels_, height_, + im2col(bottom_data + (*bottom)[0]->offset(n), channels_, height_, width_, kernel_size_, pad_, stride_, col_data); // gradient w.r.t. weight. Note that we will accumulate diffs. for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasNoTrans, CblasTrans, M_, K_, N_, + this->device_->gemm(CblasNoTrans, CblasTrans, M_, K_, N_, (Dtype)1., top_diff + top[0]->offset(n) + top_offset * g, col_data + col_offset * g, (Dtype)1., weight_diff + weight_offset * g); @@ -150,7 +150,7 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, // gradient w.r.t. bottom data, if necessary if (propagate_down[0]) { for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasTrans, CblasNoTrans, K_, N_, M_, + this->device_->gemm(CblasTrans, CblasNoTrans, K_, N_, M_, (Dtype)1., weight + weight_offset * g, top_diff + top[0]->offset(n) + top_offset * g, (Dtype)0., col_diff + col_offset * g); diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu deleted file mode 100644 index 85f95fd32c9..00000000000 --- a/src/caffe/layers/conv_layer.cu +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright 2014 BVLC and contributors. 
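
Both the unified Forward above and the GPU file deleted below lower convolution to im2col followed by a grouped GEMM: every kernel_size x kernel_size input patch becomes one column, so the convolution reduces to a matrix product against the filter matrix. A minimal single-image im2col sketch with the same ksize/pad/stride arguments, zero padding handled by bounds checks:

    // data_im:  channels x height x width input image
    // data_col: (channels * ksize * ksize) x (height_col * width_col) output
    void im2col_simple(const float* data_im, int channels, int height, int width,
                       int ksize, int pad, int stride, float* data_col) {
      const int height_col = (height + 2 * pad - ksize) / stride + 1;
      const int width_col = (width + 2 * pad - ksize) / stride + 1;
      const int channels_col = channels * ksize * ksize;
      for (int c = 0; c < channels_col; ++c) {
        const int w_offset = c % ksize;
        const int h_offset = (c / ksize) % ksize;
        const int c_im = c / ksize / ksize;
        for (int h = 0; h < height_col; ++h) {
          for (int w = 0; w < width_col; ++w) {
            const int h_im = h * stride - pad + h_offset;
            const int w_im = w * stride - pad + w_offset;
            data_col[(c * height_col + h) * width_col + w] =
                (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width)
                    ? data_im[(c_im * height + h_im) * width + w_im]
                    : 0.0f;  // zero padding
          }
        }
      }
    }
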
- -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/filler.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype ConvolutionLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - Dtype* col_data = col_buffer_.mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - int weight_offset = M_ * K_; - int col_offset = K_ * N_; - int top_offset = M_ * N_; - for (int n = 0; n < num_; ++n) { - // First, im2col - im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_size_, pad_, stride_, col_data); - // Second, innerproduct with groups - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, K_, - (Dtype)1., weight + weight_offset * g, col_data + col_offset * g, - (Dtype)0., top_data + (*top)[0]->offset(n) + top_offset * g); - } - // third, add bias - if (bias_term_) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - N_, 1, (Dtype)1., this->blobs_[1]->gpu_data(), - reinterpret_cast(bias_multiplier_->gpu_data()), - (Dtype)1., top_data + (*top)[0]->offset(n)); - } - } - return Dtype(0.); -} - -template -void ConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - const Dtype* bottom_data = (*bottom)[0]->gpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - Dtype* col_data = col_buffer_.mutable_gpu_data(); - Dtype* col_diff = col_buffer_.mutable_gpu_diff(); - // bias gradient if necessary - Dtype* bias_diff = NULL; - - if (bias_term_) { - bias_diff = this->blobs_[1]->mutable_gpu_diff(); - CUDA_CHECK(cudaMemset(bias_diff, 0, - sizeof(Dtype) * this->blobs_[1]->count())); - for (int n = 0; n < num_; ++n) { - caffe_gpu_gemv(CblasNoTrans, num_output_, N_, - 1., top_diff + top[0]->offset(n), - reinterpret_cast(bias_multiplier_->gpu_data()), - 1., bias_diff); - } - } - - int weight_offset = M_ * K_; - int col_offset = K_ * N_; - int top_offset = M_ * N_; - CUDA_CHECK(cudaMemset(weight_diff, 0, - sizeof(Dtype) * this->blobs_[0]->count())); - for (int n = 0; n < num_; ++n) { - // since we saved memory in the forward pass by not storing all col data, - // we will need to recompute them. - im2col_gpu(bottom_data + (*bottom)[0]->offset(n), channels_, height_, - width_, kernel_size_, pad_, stride_, col_data); - // gradient w.r.t. weight. Note that we will accumulate diffs. - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, K_, N_, - (Dtype)1., top_diff + top[0]->offset(n) + top_offset * g, - col_data + col_offset * g, (Dtype)1., - weight_diff + weight_offset * g); - } - // gradient w.r.t. 
bottom data, if necessary - if (propagate_down[0]) { - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasTrans, CblasNoTrans, K_, N_, M_, - (Dtype)1., weight + weight_offset * g, - top_diff + top[0]->offset(n) + top_offset * g, - (Dtype)0., col_diff + col_offset * g); - } - // col2im back to the data - col2im_gpu(col_diff, channels_, height_, width_, kernel_size_, pad_, - stride_, bottom_diff + (*bottom)[0]->offset(n)); - } - } -} - - -INSTANTIATE_CLASS(ConvolutionLayer); - -} // namespace caffe From d622419d969e0e4774dd8c1ef330de3e6f45c4f5 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Mon, 26 May 2014 09:51:12 +0800 Subject: [PATCH 29/75] Unify the CPU/GPU versions of Forward/Backward of the Im2colLayer --- include/caffe/vision_layers.hpp | 13 ++++------- src/caffe/layers/im2col_layer.cpp | 16 ++++++------- src/caffe/layers/im2col_layer.cu | 38 ------------------------------- 3 files changed, 12 insertions(+), 55 deletions(-) delete mode 100644 src/caffe/layers/im2col_layer.cu diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 2c8397a0d28..1e117c714d2 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -180,6 +180,10 @@ class Im2colLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const bool propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_IM2COL; @@ -188,15 +192,6 @@ class Im2colLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - int kernel_size_; int stride_; int channels_; diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index e047dfb80a7..28321b296b4 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -25,24 +25,24 @@ void Im2colLayer::SetUp(const vector*>& bottom, } template -Dtype Im2colLayer::Forward_cpu(const vector*>& bottom, +Dtype Im2colLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, + im2col(bottom_data + bottom[0]->offset(n), channels_, height_, width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n)); } return Dtype(0.); } template -void Im2colLayer::Backward_cpu(const vector*>& top, +void Im2colLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); for (int n = 0; n < top[0]->num(); ++n) { - col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_, + col2im(top_diff + top[0]->offset(n), channels_, height_, width_, kernel_size_, pad_, stride_, 
bottom_diff + (*bottom)[0]->offset(n)); } } diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu deleted file mode 100644 index 9cfb74e815c..00000000000 --- a/src/caffe/layers/im2col_layer.cu +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include - -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/common.hpp" - -namespace caffe { - -template -Dtype Im2colLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n)); - } - return Dtype(0.); -} - -template -void Im2colLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - for (int n = 0; n < top[0]->num(); ++n) { - col2im_gpu(top_diff + top[0]->offset(n), channels_, height_, width_, - kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n)); - } -} - - -INSTANTIATE_CLASS(Im2colLayer); - -} // namespace caffe From 5089d44dc721c44425eba9de5fc345955a0c5d27 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Mon, 26 May 2014 09:58:04 +0800 Subject: [PATCH 30/75] Unify the CPU/GPU versions of Forward/Backward of the PowerLayer --- include/caffe/neuron_layers.hpp | 13 ++--- src/caffe/layers/power_layer.cpp | 50 ++++++++--------- src/caffe/layers/power_layer.cu | 92 -------------------------------- 3 files changed, 29 insertions(+), 126 deletions(-) delete mode 100644 src/caffe/layers/power_layer.cu diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index e52e395e24b..86eb15660f8 100644 --- a/include/caffe/neuron_layers.hpp +++ b/include/caffe/neuron_layers.hpp @@ -120,21 +120,16 @@ class PowerLayer : public NeuronLayer { : NeuronLayer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const bool propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_POWER; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - Dtype power_; Dtype scale_; Dtype shift_; diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index 5ff3392968e..f596cd83d06 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -23,78 +23,78 @@ void PowerLayer::SetUp(const vector*>& bottom, // Compute y = (shift + scale * x)^power template -Dtype PowerLayer::Forward_cpu(const vector*>& bottom, +Dtype PowerLayer::Forward(const vector*>& bottom, vector*>* top) { - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int count = bottom[0]->count(); // Special case where we can ignore the input: scale or power is 0. if (diff_scale_ == Dtype(0)) { Dtype value = (power_ == 0) ? 
Dtype(1) : pow(shift_, power_); - caffe_set(count, value, top_data); + this->device_->set(count, value, top_data); return Dtype(0); } - const Dtype* bottom_data = bottom[0]->cpu_data(); - caffe_copy(count, bottom_data, top_data); + const Dtype* bottom_data = bottom[0]->const_data(); + this->device_->copy(count, bottom_data, top_data); if (scale_ != Dtype(1)) { - caffe_scal(count, scale_, top_data); + this->device_->scal(count, scale_, top_data); } if (shift_ != Dtype(0)) { - caffe_add_scalar(count, shift_, top_data); + this->device_->add_scalar(count, shift_, top_data); } if (power_ != Dtype(1)) { - caffe_powx(count, top_data, power_, top_data); + this->device_->powx(count, top_data, power_, top_data); } return Dtype(0); } template -void PowerLayer::Backward_cpu(const vector*>& top, +void PowerLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); const int count = (*bottom)[0]->count(); - const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - caffe_set(count, diff_scale_, bottom_diff); + this->device_->set(count, diff_scale_, bottom_diff); } else { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) // = diff_scale * y / (shift + scale * x) if (power_ == Dtype(2)) { // Special case for y = (shift + scale * x)^2 // -> dy/dx = 2 * scale * (shift + scale * x) // = diff_scale * shift + diff_scale * scale * x - caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data, + this->device_->axpby(count, diff_scale_ * scale_, bottom_data, Dtype(0), bottom_diff); if (shift_ != Dtype(0)) { - caffe_add_scalar(count, diff_scale_ * shift_, bottom_diff); + this->device_->add_scalar(count, diff_scale_ * shift_, bottom_diff); } } else if (shift_ == Dtype(0)) { // Special case for y = (scale * x)^power // -> dy/dx = scale * power * (scale * x)^(power - 1) // = scale * power * (scale * x)^power * (scale * x)^(-1) // = power * y / x - const Dtype* top_data = top[0]->cpu_data(); - caffe_div(count, top_data, bottom_data, bottom_diff); - caffe_scal(count, power_, bottom_diff); + const Dtype* top_data = top[0]->const_data(); + this->device_->div(count, top_data, bottom_data, bottom_diff); + this->device_->scal(count, power_, bottom_diff); } else { - caffe_copy(count, bottom_data, bottom_diff); + this->device_->copy(count, bottom_data, bottom_diff); if (scale_ != Dtype(1)) { - caffe_scal(count, scale_, bottom_diff); + this->device_->scal(count, scale_, bottom_diff); } if (shift_ != Dtype(0)) { - caffe_add_scalar(count, shift_, bottom_diff); + this->device_->add_scalar(count, shift_, bottom_diff); } - const Dtype* top_data = top[0]->cpu_data(); - caffe_div(count, top_data, bottom_diff, bottom_diff); + const Dtype* top_data = top[0]->const_data(); + this->device_->div(count, top_data, bottom_diff, bottom_diff); if (diff_scale_ != Dtype(1)) { - caffe_scal(count, diff_scale_, bottom_diff); + this->device_->scal(count, diff_scale_, bottom_diff); } } } if (diff_scale_ != Dtype(0)) { - caffe_mul(count, top_diff, bottom_diff, bottom_diff); + this->device_->mul(count, top_diff, bottom_diff, bottom_diff); } } } diff --git a/src/caffe/layers/power_layer.cu b/src/caffe/layers/power_layer.cu deleted file mode 100644 index 6d699636e21..00000000000 --- 
a/src/caffe/layers/power_layer.cu +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -using std::max; - -namespace caffe { - -template -Dtype PowerLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // Special case where we can ignore the input: scale or power is 0. - if (diff_scale_ == Dtype(0)) { - Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); - caffe_gpu_set(count, value, top_data); - return Dtype(0); - } - const Dtype* bottom_data = bottom[0]->gpu_data(); - caffe_gpu_copy(count, bottom_data, top_data); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, top_data); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, top_data); - } - if (power_ != Dtype(1)) { - caffe_gpu_powx(count, top_data, power_, top_data); - } - return Dtype(0); -} - -template -void PowerLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - vector*>* bottom) { - if (propagate_down[0]) { - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - const int count = (*bottom)[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - caffe_gpu_set(count, diff_scale_, bottom_diff); - } else { - const Dtype* bottom_data = (*bottom)[0]->gpu_data(); - // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) - // = diff_scale * y / (shift + scale * x) - if (power_ == Dtype(2)) { - // Special case for y = (shift + scale * x)^2 - // -> dy/dx = 2 * scale * (shift + scale * x) - // = diff_scale * shift + diff_scale * scale * x - caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); - } - } else if (shift_ == Dtype(0)) { - // Special case for y = (scale * x)^power - // -> dy/dx = scale * power * (scale * x)^(power - 1) - // = scale * power * (scale * x)^power * (scale * x)^(-1) - // = power * y / x - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - caffe_gpu_scal(count, power_, bottom_diff); - } else { - caffe_gpu_copy(count, bottom_data, bottom_diff); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, bottom_diff); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, bottom_diff); - } - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); - if (diff_scale_ != Dtype(1)) { - caffe_gpu_scal(count, diff_scale_, bottom_diff); - } - } - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); - } -} - -INSTANTIATE_CLASS(PowerLayer); - - -} // namespace caffe From 5c468a16b77bc8873141515fe8a081c33c582235 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Mon, 26 May 2014 10:05:33 +0800 Subject: [PATCH 31/75] Move im2col and col2im into the device wrapper classes --- include/caffe/util/device.hpp | 25 ++++++++++++++++++++++++ include/caffe/util/im2col.hpp | 32 ------------------------------- src/caffe/layers/conv_layer.cpp | 13 ++++++++----- src/caffe/layers/im2col_layer.cpp | 6 ++++-- src/caffe/util/cpu_device.cpp | 16 ++++++++++++++++ src/caffe/util/gpu_device.cpp | 16 ++++++++++++++++ 6 files changed, 69 insertions(+), 39 deletions(-) diff --git a/include/caffe/util/device.hpp 
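
This patch folds im2col/col2im into the Device interface changed below, so layer code only ever calls this->device_->im2col(...) and never branches on the mode itself. How the concrete backend is picked is not shown in these hunks; the following is a hedged sketch of one way it could be done, where the GetDevice name and the static instances are assumptions for illustration, not the factory actually added in this series:

    #include "caffe/common.hpp"
    #include "caffe/util/device.hpp"

    namespace caffe {

    // Illustrative only: choose the math/im2col backend from the global mode.
    template <typename Dtype>
    Device<Dtype>* GetDevice() {
      static CPUDevice<Dtype> cpu_device;
      static GPUDevice<Dtype> gpu_device;
      switch (Caffe::mode()) {
      case Caffe::CPU:
        return &cpu_device;
      case Caffe::GPU:
        return &gpu_device;
      default:
        LOG(FATAL) << "Unknown caffe mode.";
        return NULL;
      }
    }

    }  // namespace caffe
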
b/include/caffe/util/device.hpp index e9d95b6c9ce..8bc417ee561 100644 --- a/include/caffe/util/device.hpp +++ b/include/caffe/util/device.hpp @@ -8,6 +8,7 @@ #include "glog/logging.h" +#include "caffe/util/im2col.hpp" #include "caffe/util/math_functions.hpp" namespace caffe { @@ -79,6 +80,14 @@ class Device { virtual void scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y) = 0; + + virtual void im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col) = 0; + + virtual void col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im) = 0; }; template @@ -146,6 +155,14 @@ class CPUDevice : public Device { virtual void fabs(const int N, const Dtype* x, Dtype* y); virtual void scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y); + + virtual void im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col); + + virtual void col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im); }; template @@ -213,6 +230,14 @@ class GPUDevice : public Device { virtual void fabs(const int N, const Dtype* x, Dtype* y); virtual void scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y); + + virtual void im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col); + + virtual void col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im); }; template diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index e14018fd356..809308caf80 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -27,38 +27,6 @@ void col2im_gpu(const Dtype* data_col, const int channels, const int height, const int width, const int psize, const int pad, const int stride, Dtype* data_im); -template -inline void im2col(const Dtype* data_im, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col) { - switch (Caffe::mode()) { - case Caffe::CPU: - im2col_cpu(data_im, channels, height, width, ksize, pad, stride, - data_col); - case Caffe::GPU: - im2col_gpu(data_im, channels, height, width, ksize, pad, stride, - data_col); - default: - LOG(FATAL) << "Unknown caffe mode."; - } -} - -template -inline void col2im(const Dtype* data_col, const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, Dtype* data_im) { - switch (Caffe::mode()) { - case Caffe::CPU: - im2col_cpu(data_col, channels, height, width, psize, pad, stride, - data_im); - case Caffe::GPU: - im2col_gpu(data_col, channels, height, width, psize, pad, stride, - data_im); - default: - LOG(FATAL) << "Unknown caffe mode."; - } -} - } // namespace caffe #endif // CAFFE_UTIL_IM2COL_HPP_ diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 891097b239f..02871c29d05 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -88,8 +88,9 @@ Dtype ConvolutionLayer::Forward(const vector*>& bottom, int top_offset = M_ * N_; for (int n = 0; n < num_; ++n) { // First, im2col - 
im2col(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_size_, pad_, stride_, col_data); + this->device_->im2col( + bottom_data + bottom[0]->offset(n), channels_, height_, + width_, kernel_size_, pad_, stride_, col_data); // Second, innerproduct with groups for (int g = 0; g < group_; ++g) { this->device_->gemm(CblasNoTrans, CblasNoTrans, M_, N_, K_, @@ -138,8 +139,9 @@ void ConvolutionLayer::Backward(const vector*>& top, for (int n = 0; n < num_; ++n) { // since we saved memory in the forward pass by not storing all col data, // we will need to recompute them. - im2col(bottom_data + (*bottom)[0]->offset(n), channels_, height_, - width_, kernel_size_, pad_, stride_, col_data); + this->device_->im2col( + bottom_data + (*bottom)[0]->offset(n), channels_, height_, + width_, kernel_size_, pad_, stride_, col_data); // gradient w.r.t. weight. Note that we will accumulate diffs. for (int g = 0; g < group_; ++g) { this->device_->gemm(CblasNoTrans, CblasTrans, M_, K_, N_, @@ -156,7 +158,8 @@ void ConvolutionLayer::Backward(const vector*>& top, (Dtype)0., col_diff + col_offset * g); } // col2im back to the data - col2im_cpu(col_diff, channels_, height_, width_, kernel_size_, pad_, + this->device_->col2im( + col_diff, channels_, height_, width_, kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n)); } } diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index 28321b296b4..26585e67ddc 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -30,7 +30,8 @@ Dtype Im2colLayer::Forward(const vector*>& bottom, const Dtype* bottom_data = bottom[0]->const_data(); Dtype* top_data = (*top)[0]->mutable_data(); for (int n = 0; n < bottom[0]->num(); ++n) { - im2col(bottom_data + bottom[0]->offset(n), channels_, height_, + this->device_->im2col( + bottom_data + bottom[0]->offset(n), channels_, height_, width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n)); } return Dtype(0.); @@ -42,7 +43,8 @@ void Im2colLayer::Backward(const vector*>& top, const Dtype* top_diff = top[0]->const_diff(); Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); for (int n = 0; n < top[0]->num(); ++n) { - col2im(top_diff + top[0]->offset(n), channels_, height_, width_, + this->device_->col2im( + top_diff + top[0]->offset(n), channels_, height_, width_, kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n)); } } diff --git a/src/caffe/util/cpu_device.cpp b/src/caffe/util/cpu_device.cpp index 6c8e5f42f47..72e3af12025 100644 --- a/src/caffe/util/cpu_device.cpp +++ b/src/caffe/util/cpu_device.cpp @@ -154,6 +154,22 @@ void CPUDevice::scale(const int N, const Dtype alpha, caffe_cpu_scale(N, alpha, x, y); } +template +void CPUDevice::im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col) { + im2col_cpu(data_im, channels, height, width, ksize, pad, stride, + data_col); +} + +template +void CPUDevice::col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im) { + col2im_cpu(data_col, channels, height, width, psize, pad, stride, + data_im); +} + INSTANTIATE_CLASS(CPUDevice); } // namespace caffe diff --git a/src/caffe/util/gpu_device.cpp b/src/caffe/util/gpu_device.cpp index c388f5f10b7..d5083801b55 100644 --- a/src/caffe/util/gpu_device.cpp +++ b/src/caffe/util/gpu_device.cpp @@ -157,6 +157,22 @@ void GPUDevice::scale(const int 
N, const Dtype alpha, caffe_gpu_scale(N, alpha, x, y); } +template +void GPUDevice::im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col) { + im2col_gpu(data_im, channels, height, width, ksize, pad, stride, + data_col); +} + +template +void GPUDevice::col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im) { + col2im_gpu(data_col, channels, height, width, psize, pad, stride, + data_im); +} + INSTANTIATE_CLASS(GPUDevice); } // namespace caffe From 26629878d8b84f5671aa5fe74646eec681e72d97 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 14:35:25 +0800 Subject: [PATCH 32/75] Update the include guard of the util/device.hpp --- include/caffe/util/device.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/caffe/util/device.hpp b/include/caffe/util/device.hpp index 8bc417ee561..c7cc21baa66 100644 --- a/include/caffe/util/device.hpp +++ b/include/caffe/util/device.hpp @@ -1,7 +1,7 @@ // Copyright 2014 BVLC and contributors. -#ifndef CAFFE_UTIL_MATH_BACKENDS_H_ -#define CAFFE_UTIL_MATH_BACKENDS_H_ +#ifndef CAFFE_UTIL_DEVICE_H_ +#define CAFFE_UTIL_DEVICE_H_ #include #include @@ -251,4 +251,4 @@ class DeviceFactory { } // namespace caffe -#endif // CAFFE_UTIL_MATH_BACKENDS_H_ +#endif // CAFFE_UTIL_DEVICE_H_ From 8c3d26a5de589b6f8655270f793fe2708e659740 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 15:53:08 +0800 Subject: [PATCH 33/75] Add OpenCLDevice header file and to_clblasTranspose inline function --- include/caffe/util/opencl_device.hpp | 109 +++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 include/caffe/util/opencl_device.hpp diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp new file mode 100644 index 00000000000..0b25b99bf82 --- /dev/null +++ b/include/caffe/util/opencl_device.hpp @@ -0,0 +1,109 @@ +// Copyright 2014 BVLC and contributors. 
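// Note (a sketch, not part of the patches above): the inline im2col()/col2im()
// dispatchers deleted from im2col.hpp by patch 31 switched on Caffe::mode() but
// had no break after each case, so a CPU call fell through into the GPU branch
// and then into LOG(FATAL). Routing the calls through Device::im2col avoids the
// switch entirely. For comparison, the corrected form of the old dispatch would
// have looked roughly like this (the helper name im2col_dispatch is
// illustrative only; Caffe::mode() and im2col_cpu/im2col_gpu are the existing
// helpers kept in im2col.hpp):
template <typename Dtype>
inline void im2col_dispatch(const Dtype* data_im, const int channels,
    const int height, const int width, const int ksize, const int pad,
    const int stride, Dtype* data_col) {
  switch (Caffe::mode()) {
  case Caffe::CPU:
    im2col_cpu(data_im, channels, height, width, ksize, pad, stride, data_col);
    break;  // the missing break was the fall-through bug
  case Caffe::GPU:
    im2col_gpu(data_im, channels, height, width, ksize, pad, stride, data_col);
    break;
  default:
    LOG(FATAL) << "Unknown caffe mode.";
  }
}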
+ +#ifndef CAFFE_UTIL_OPENCL_DEVICE_H_ +#define CAFFE_UTIL_OPENCL_DEVICE_H_ + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include "glog/logging.h" + +#include "caffe/util/device.hpp" + +namespace caffe { + +inline clblasTranspose to_clblasTranspose(const CBLAS_TRANSPOSE trans) { + switch (trans) { + case CblasNoTrans: + return clblasNoTrans; + case CblasTrans: + return clblasTrans; + case CblasConjTrans: + return clblasConjTrans; + default: + LOG(FATAL) << "Unknown CBLAS_TRANSPOSE " << trans; + } +} + +template +class OpenCLDevice : public Device { + public: + OpenCLDevice() { + } + virtual ~OpenCLDevice() { + } + virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, const Dtype beta, Dtype* C); + + virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, + const Dtype beta, Dtype* y); + + virtual void axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); + + virtual void axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + + virtual void copy(const int N, const Dtype *X, Dtype *Y); + virtual void copy_from_cpu(const int N, const Dtype* X, Dtype* Y); + + virtual void set(const int N, const Dtype alpha, Dtype *X); + + virtual void add_scalar(const int N, const Dtype alpha, Dtype *X); + + virtual void scal(const int N, const Dtype alpha, Dtype *X); + + virtual void sqr(const int N, const Dtype* a, Dtype* y); + + virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + + virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y); + + virtual void rng_uniform(const int N, const Dtype a, const Dtype b, Dtype* r); + + virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, + Dtype* r); + + virtual void rng_bernoulli(const int N, const Dtype p, int* r); + + virtual void exp(const int N, const Dtype* a, Dtype* y); + + virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); + + virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, + uint32_t* out); + +// Returns the sum of the absolute values of the elements of vector x + virtual void asum(const int N, const Dtype* x, Dtype* y); + + virtual void sign(const int N, const Dtype* x, Dtype* y); + + virtual void sgnbit(const int N, const Dtype* x, Dtype* y); + + virtual void fabs(const int N, const Dtype* x, Dtype* y); + + virtual void scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y); + + virtual void im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col); + + virtual void col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im); +}; + + +} // namespace caffe + +#endif // CAFFE_UTIL_OPENCL_DEVICE_H_ From 1a2f04c2f481275ec7b5821307b0bf01dd74603a Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 17:01:20 +0800 Subject: [PATCH 34/75] Add macros and get error string functions for the OpenCL device --- include/caffe/util/opencl_device.hpp | 39 ++++++++++ src/caffe/util/opencl_device.cpp 
| 104 +++++++++++++++++++++++++++ 2 files changed, 143 insertions(+) create mode 100644 src/caffe/util/opencl_device.cpp diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp index 0b25b99bf82..306cf0a1a6f 100644 --- a/include/caffe/util/opencl_device.hpp +++ b/include/caffe/util/opencl_device.hpp @@ -15,6 +15,45 @@ namespace caffe { +#define CL_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cl_int error = condition; \ + CHECK_EQ(error, CL_SUCCESS) << " " << clGetErrorString(error); \ + } while (0) + +#define CLBLAS_CHECK(condition) \ + do { \ + clblasStatus_t status = condition; \ + CHECK_EQ(status, clblasSuccess) << " " \ + << caffe::clblasGetErrorString(status); \ + } while (0) + +#define CREATE_CL_MEM(A, M, K, FLAG) \ + do { \ + cl_int error; + cl_mem buf##A = clCreateBuffer( \ + Caffe::opencl_context(), CL_MEM_##FLAG, M * K * sizeof(*A), \ + NULL, &error); \ + CL_CHECK(error); \ + } while(0) + +#define RELEASE_CL_MEM(A) \ clReleaseMemObject(buf##A) + +#define ENQUEUE_CL_BUFFER(FLAG, A, M, K) \ + CLBLAS_CHECK(clEnqueue##FLAG##Buffer( + Caffe::opencl_queue(), bufA, CL_TRUE, 0, M * K * sizeof(*A), + A, 0, NULL, NULL)); + +#define PRE_CLBLAS_CALL \ + cl_uint numCommandQueues = 1; \ + cl_uint numEventsInWaitList = 0; \ + cl_event *eventWaitList = NULL; \ + cl_event events = NULL + +const char* clGetErrorString(cl_int error); +const char* clblasGetErrorString(clblasStatus_t status); + inline clblasTranspose to_clblasTranspose(const CBLAS_TRANSPOSE trans) { switch (trans) { case CblasNoTrans: diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp new file mode 100644 index 00000000000..d21420174a6 --- /dev/null +++ b/src/caffe/util/opencl_device.cpp @@ -0,0 +1,104 @@ +// Copyright 2014 BVLC and contributors. 
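// Note (a sketch, not part of the patch above): as added in opencl_device.hpp,
// the buffer helpers would not preprocess cleanly. CREATE_CL_MEM appears to
// drop the continuation backslash after "cl_int error;" and declares buf##A
// inside a do/while block, so the handle is out of scope by the time the
// clBLAS call uses it; ENQUEUE_CL_BUFFER hard-codes bufA, omits its
// continuation backslashes, and checks a cl_int result with CLBLAS_CHECK.
// The intent is roughly the following, using only calls the patch already
// relies on (Caffe::opencl_context() and Caffe::opencl_queue() are assumed to
// exist, as in the original):
#define CREATE_CL_MEM(A, M, K, FLAG) \
  cl_int err_##A; \
  cl_mem buf##A = clCreateBuffer( \
      Caffe::opencl_context(), CL_MEM_##FLAG, (M) * (K) * sizeof(*A), \
      NULL, &err_##A); \
  CL_CHECK(err_##A)

#define RELEASE_CL_MEM(A) clReleaseMemObject(buf##A)

#define ENQUEUE_CL_BUFFER(FLAG, A, M, K) \
  CL_CHECK(clEnqueue##FLAG##Buffer( \
      Caffe::opencl_queue(), buf##A, CL_TRUE, 0, (M) * (K) * sizeof(*A), \
      A, 0, NULL, NULL))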
+ +#include "caffe/common.hpp" +#include "caffe/util/opencl_device.hpp" + +namespace caffe { + +const char* clGetErrorString(cl_int error) { + switch (error) { + case CL_SUCCESS: + return "CL_SUCCESS"; + case CL_INVALID_VALUE: + return "CL_INVALID_VALUE"; + case CL_INVALID_COMMAND_QUEUE: + return "CL_INVALID_COMMAND_QUEUE"; + case CL_INVALID_CONTEXT: + return "CL_INVALID_CONTEXT"; + case CL_INVALID_MEM_OBJECT: + return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_DEVICE: + return "CL_INVALID_DEVICE"; + case CL_INVALID_EVENT_WAIT_LIST: + return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_OUT_OF_RESOURCES: + return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: + return "CL_OUT_OF_HOST_MEMORY"; + case CL_INVALID_OPERATION: + return "CL_INVALID_OPERATION"; + case CL_COMPILER_NOT_AVAILABLE: + return "CL_COMPILER_NOT_AVAILABLE"; + case CL_BUILD_PROGRAM_FAILURE: + return "CL_BUILD_PROGRAM_FAILURE"; + } + return "Unknown OpenCL error"; +} + +const char* clblasGetErrorString(clblasStatus_t status) { + switch (status) { + case clblasSuccess: + return "clblasSuccess"; + case clblasInvalidValue: + return "clblasInvalidValue"; + case clblasInvalidCommandQueue: + return "clblasInvalidCommandQueue"; + case clblasInvalidContext: + return "clblasInvalidContext"; + case clblasInvalidMemObject: + return "clblasInvalidMemObject"; + case clblasInvalidDevice: + return "clblasInvalidDevice"; + case clblasInvalidEventWaitList: + return "clblasInvalidEventWaitList"; + case clblasOutOfResources: + return "clblasOutOfResources"; + case clblasOutOfHostMemory: + return "clblasOutOfHostMemory"; + case clblasInvalidOperation: + return "clblasInvalidOperation"; + case clblasCompilerNotAvailable: + return "clblasCompilerNotAvailable"; + case clblasBuildProgramFailure: + return "clblasBuildProgramFailure"; + case clblasNotImplemented: + return "clblasNotImplemented"; + case clblasNotInitialized: + return "clblasNotInitialized"; + case clblasInvalidMatA: + return "clblasInvalidMatA"; + case clblasInvalidMatB: + return "clblasInvalidMatB"; + case clblasInvalidMatC: + return "clblasInvalidMatC"; + case clblasInvalidVecX: + return "clblasInvalidVecX"; + case clblasInvalidVecY: + return "clblasInvalidVecY"; + case clblasInvalidDim: + return "clblasInvalidDim"; + case clblasInvalidLeadDimA: + return "clblasInvalidLeadDimA"; + case clblasInvalidLeadDimB: + return "clblasInvalidLeadDimB"; + case clblasInvalidLeadDimC: + return "clblasInvalidLeadDimC"; + case clblasInvalidIncX: + return "clblasInvalidIncX"; + case clblasInvalidIncY: + return "clblasInvalidIncY"; + case clblasInsufficientMemMatA: + return "clblasInsufficientMemMatA"; + case clblasInsufficientMemMatB: + return "clblasInsufficientMemMatB"; + case clblasInsufficientMemMatC: + return "clblasInsufficientMemMatC"; + case clblasInsufficientMemVecX: + return "clblasInsufficientMemVecX"; + case clblasInsufficientMemVecY: + return "clblasInsufficientMemVecY"; + } + return "Unknown clblas status"; +} + +} // namespace caffe From dbb0cc5b709e94fae2a7e5ee7a6cc49dc2a3e702 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 17:12:31 +0800 Subject: [PATCH 35/75] Implement OpenCLDevice::gemm --- src/caffe/util/opencl_device.cpp | 192 +++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index d21420174a6..8149997cde2 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -5,6 +5,198 @@ namespace caffe { +template +void OpenCLDevice::gemm(const 
CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, + const Dtype beta, Dtype* C) { + // Note that cublas follows fortran order. + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = (TransA == CblasNoTrans) ? N : M; + clblasTranspose clTransA = to_clblasTranspose(TransA); + clblasTranspose clTransB = to_clblasTranspose(TransB); + CREATE_CL_MEM(A, M, K, READ_ONLY); + CREATE_CL_MEM(B, K, N, READ_ONLY); + CREATE_CL_MEM(C, M, N, READ_WRITE); + ENQUEUE_CL_BUFFER(Write, A, M, K); + ENQUEUE_CL_BUFFER(Write, B, K, N); + ENQUEUE_CL_BUFFER(Write, C, M, N); + PRE_CLBLAS_CALL; + // bufX is defined by the macro CREATE_CL_MEM(X, ...) + CLBLAS_CHECK(clblasSgemm(clblasRowMajor, clTransA, clTransB, + M, N, K, &alpha, bufA, 0, lda, bufB, 0, ldb, &beta, bufC, 0, ldc + numCommandQueues, Caffe::opencl_queue(), numEventsInWaitList, + eventWaitList, &events)); + /* Release OpenCL memory objects. */ + RELEASE_CL_MEM(C); + RELEASE_CL_MEM(B); + RELEASE_CL_MEM(A); +} + +template +void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const Dtype alpha, const Dtype* A, + const Dtype* x, const Dtype beta, Dtype* y) { + caffe_gpu_gemv(TransA, M, N, alpha, A, x, beta, y); +} + +template +void OpenCLDevice::axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y) { + caffe_gpu_axpy(N, alpha, X, Y); +} + +template +void OpenCLDevice::axpby(const int N, const Dtype alpha, + const Dtype* X, const Dtype beta, Dtype* Y) { + caffe_gpu_axpby(N, alpha, X, beta, Y); +} + +template +void OpenCLDevice::copy(const int N, const Dtype *X, Dtype *Y) { + caffe_gpu_copy(N, X, Y); +} + +template +void OpenCLDevice::copy_from_cpu(const int N, const Dtype *X, Dtype *Y) { + CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyHostToDevice)); +} + +template +void OpenCLDevice::set(const int N, const Dtype alpha, Dtype *X) { + caffe_gpu_set(N, alpha, X); +} + +template +void OpenCLDevice::add_scalar(const int N, const Dtype alpha, + Dtype *X) { + caffe_gpu_add_scalar(N, alpha, X); +} + +template +void OpenCLDevice::scal(const int N, const Dtype alpha, Dtype *X) { + caffe_gpu_scal(N, alpha, X); +} + +template +void OpenCLDevice::sqr(const int N, const Dtype* a, Dtype* y) { + NOT_IMPLEMENTED; +// caffe_gpu_sqr(N, a, y); +} + +template +void OpenCLDevice::add(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_gpu_add(N, a, b, y); +} + +template +void OpenCLDevice::sub(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_gpu_sub(N, a, b, y); +} + +template +void OpenCLDevice::mul(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_gpu_mul(N, a, b, y); +} + +template +void OpenCLDevice::div(const int N, const Dtype* a, const Dtype* b, + Dtype* y) { + caffe_gpu_div(N, a, b, y); +} + +template +void OpenCLDevice::powx(const int N, const Dtype* a, const Dtype b, + Dtype* y) { + caffe_gpu_powx(N, a, b, y); +} + +template +void OpenCLDevice::rng_uniform(const int N, const Dtype a, + const Dtype b, Dtype* r) { + caffe_gpu_rng_uniform(N, a, b, r); +} + +template +void OpenCLDevice::rng_gaussian(const int N, const Dtype mu, + const Dtype sigma, Dtype* r) { + caffe_gpu_rng_gaussian(N, mu, sigma, r); +} + +template +void OpenCLDevice::rng_bernoulli(const int N, const Dtype p, int* r) { + NOT_IMPLEMENTED; +// caffe_gpu_rng_bernoulli(N, p, r); +} + +template +void OpenCLDevice::exp(const int N, const Dtype* a, Dtype* y) { + 
NOT_IMPLEMENTED; +// caffe_gpu_exp(N, a, y); +} + +template +void OpenCLDevice::dot(const int N, const Dtype* x, const Dtype* y, + Dtype* out) { + caffe_gpu_dot(N, x, y, out); +} + +template +void OpenCLDevice::hamming_distance(const int N, const Dtype* x, + const Dtype* y, uint32_t* out) { + *out = caffe_gpu_hamming_distance(N, x, y); +} + +template +// Returns the sum of the absolute values of the elements of vector x +void OpenCLDevice::asum(const int N, const Dtype* x, Dtype* y) { + caffe_gpu_asum(N, x, y); +} + +template +void OpenCLDevice::sign(const int N, const Dtype* x, Dtype* y) { + caffe_gpu_sign(N, x, y); +} + +template +void OpenCLDevice::sgnbit(const int N, const Dtype* x, Dtype* y) { + caffe_gpu_sgnbit(N, x, y); +} + +template +void OpenCLDevice::fabs(const int N, const Dtype* x, Dtype* y) { + caffe_gpu_fabs(N, x, y); +} + +template +void OpenCLDevice::scale(const int N, const Dtype alpha, + const Dtype *x, Dtype* y) { + caffe_gpu_scale(N, alpha, x, y); +} + +template +void OpenCLDevice::im2col(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, Dtype* data_col) { + im2col_gpu(data_im, channels, height, width, ksize, pad, stride, + data_col); +} + +template +void OpenCLDevice::col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, Dtype* data_im) { + col2im_gpu(data_col, channels, height, width, psize, pad, stride, + data_im); +} + +INSTANTIATE_CLASS(OpenCLDevice); + const char* clGetErrorString(cl_int error) { switch (error) { case CL_SUCCESS: From d9add068c3c8ca7a987c51645b15f7defc375ddf Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 17:15:54 +0800 Subject: [PATCH 36/75] Split OpenCLDevice::gemm into float and double --- src/caffe/util/opencl_device.cpp | 34 ++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index 8149997cde2..0f9a8930f45 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -5,8 +5,8 @@ namespace caffe { -template -void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, +template <> +void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const Dtype alpha, const Dtype* A, const Dtype* B, @@ -35,6 +35,36 @@ void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, RELEASE_CL_MEM(A); } +template <> +void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, + const Dtype beta, Dtype* C) { + // Note that cublas follows fortran order. + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = (TransA == CblasNoTrans) ? N : M; + clblasTranspose clTransA = to_clblasTranspose(TransA); + clblasTranspose clTransB = to_clblasTranspose(TransB); + CREATE_CL_MEM(A, M, K, READ_ONLY); + CREATE_CL_MEM(B, K, N, READ_ONLY); + CREATE_CL_MEM(C, M, N, READ_WRITE); + ENQUEUE_CL_BUFFER(Write, A, M, K); + ENQUEUE_CL_BUFFER(Write, B, K, N); + ENQUEUE_CL_BUFFER(Write, C, M, N); + PRE_CLBLAS_CALL; + // bufX is defined by the macro CREATE_CL_MEM(X, ...) 
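// Note (a sketch, not part of the patch): this Dgemm call, like the Sgemm call
// in patch 35, is missing the comma after the trailing "ldc" argument, so the
// two source lines join as "ldc numCommandQueues". It also passes &alpha and
// &beta, whereas clBLAS, unlike the cuBLAS v2 API, takes the scalar factors by
// value; assuming that reading of clBLAS, the intended call is roughly
//   CLBLAS_CHECK(clblasDgemm(clblasRowMajor, clTransA, clTransB,
//       M, N, K, alpha, bufA, 0, lda, bufB, 0, ldb, beta, bufC, 0, ldc,
//       numCommandQueues, Caffe::opencl_queue(), numEventsInWaitList,
//       eventWaitList, &events));
// Patch 38 later reworks the argument plumbing with the ARRAY() and
// CLBALS_TRAILING_ARGS macros, which also restores the missing comma.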
+ CLBLAS_CHECK(clblasDgemm(clblasRowMajor, clTransA, clTransB, + M, N, K, &alpha, bufA, 0, lda, bufB, 0, ldb, &beta, bufC, 0, ldc + numCommandQueues, Caffe::opencl_queue(), numEventsInWaitList, + eventWaitList, &events)); + /* Release OpenCL memory objects. */ + RELEASE_CL_MEM(C); + RELEASE_CL_MEM(B); + RELEASE_CL_MEM(A); +} + template void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const Dtype alpha, const Dtype* A, From 5d2645373ec4650ff5f35e2a3871c7fd3badcada Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 17:23:05 +0800 Subject: [PATCH 37/75] Add OpenCLDevice macro ARRAY & CLBALS_TRAILING_ARGS, edit CREATE_CL_MEM --- include/caffe/util/opencl_device.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp index 306cf0a1a6f..659241643a9 100644 --- a/include/caffe/util/opencl_device.hpp +++ b/include/caffe/util/opencl_device.hpp @@ -30,6 +30,7 @@ namespace caffe { } while (0) #define CREATE_CL_MEM(A, M, K, FLAG) \ + int ld##A = (Trans##A == CblasNoTrans) ? K : M do { \ cl_int error; cl_mem buf##A = clCreateBuffer( \ @@ -51,6 +52,12 @@ namespace caffe { cl_event *eventWaitList = NULL; \ cl_event events = NULL +#define ARRAY(A) buf##A, 0, ld##A + +#define CLBALS_TRAILING_ARGS \ + numCommandQueues, Caffe::opencl_queue(), numEventsInWaitList, \ + eventWaitList, &events + const char* clGetErrorString(cl_int error); const char* clblasGetErrorString(clblasStatus_t status); From fc14be2303b4f4a9054aafee45773b5ec445deac Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 17:38:21 +0800 Subject: [PATCH 38/75] Simplify OpenCLDevice::gemm with the new macros --- src/caffe/util/opencl_device.cpp | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index 0f9a8930f45..ca50a06233f 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -12,9 +12,9 @@ void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, const Dtype* A, const Dtype* B, const Dtype beta, Dtype* C) { // Note that cublas follows fortran order. - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - int ldc = (TransA == CblasNoTrans) ? N : M; + LEAD_DIM(A, M, K); + LEAD_DIM(B, K, N); + LEAD_DIM(C, M, N); clblasTranspose clTransA = to_clblasTranspose(TransA); clblasTranspose clTransB = to_clblasTranspose(TransB); CREATE_CL_MEM(A, M, K, READ_ONLY); @@ -26,9 +26,8 @@ void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, PRE_CLBLAS_CALL; // bufX is defined by the macro CREATE_CL_MEM(X, ...) CLBLAS_CHECK(clblasSgemm(clblasRowMajor, clTransA, clTransB, - M, N, K, &alpha, bufA, 0, lda, bufB, 0, ldb, &beta, bufC, 0, ldc - numCommandQueues, Caffe::opencl_queue(), numEventsInWaitList, - eventWaitList, &events)); + M, N, K, &alpha, ARRAY(A), ARRAY(B), &beta, ARRAY(C), + CLBALS_TRAILING_ARGS)); /* Release OpenCL memory objects. */ RELEASE_CL_MEM(C); RELEASE_CL_MEM(B); @@ -42,9 +41,9 @@ void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, const Dtype* A, const Dtype* B, const Dtype beta, Dtype* C) { // Note that cublas follows fortran order. - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - int ldc = (TransA == CblasNoTrans) ? 
N : M; + LEAD_DIM(A, M, K); + LEAD_DIM(B, K, N); + LEAD_DIM(C, M, N); clblasTranspose clTransA = to_clblasTranspose(TransA); clblasTranspose clTransB = to_clblasTranspose(TransB); CREATE_CL_MEM(A, M, K, READ_ONLY); @@ -56,9 +55,8 @@ void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, PRE_CLBLAS_CALL; // bufX is defined by the macro CREATE_CL_MEM(X, ...) CLBLAS_CHECK(clblasDgemm(clblasRowMajor, clTransA, clTransB, - M, N, K, &alpha, bufA, 0, lda, bufB, 0, ldb, &beta, bufC, 0, ldc - numCommandQueues, Caffe::opencl_queue(), numEventsInWaitList, - eventWaitList, &events)); + M, N, K, &alpha, ARRAY(A), ARRAY(B), &beta, ARRAY(C), + CLBALS_TRAILING_ARGS)); /* Release OpenCL memory objects. */ RELEASE_CL_MEM(C); RELEASE_CL_MEM(B); From 15c54e61ad029c5c60df14b3beed89757a38344d Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 17:42:16 +0800 Subject: [PATCH 39/75] Implement OpenCLDevice::gemv --- src/caffe/util/opencl_device.cpp | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index ca50a06233f..458b6a641b3 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -63,11 +63,30 @@ void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, RELEASE_CL_MEM(A); } -template -void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, +template <> +void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, Dtype* y) { - caffe_gpu_gemv(TransA, M, N, alpha, A, x, beta, y); + clblasTranspose clTransA = to_clblasTranspose(TransA); + CREATE_CL_MEM(A, M, N, READ_ONLY); + CREATE_CL_MEM(x, N, 1, READ_ONLY); + CREATE_CL_MEM(y, M, 1, READ_ONLY); + CLBLAS_CHECK(clblasSgemv(clblasRowMajor, clTransA, M, N, &alpha, + ARRAY(A), ARRAY(x), &beta, ARRAY(y), + CLBALS_TRAILING_ARGS)); +} + +template <> +void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const Dtype alpha, const Dtype* A, + const Dtype* x, const Dtype beta, Dtype* y) { + clblasTranspose clTransA = to_clblasTranspose(TransA); + CREATE_CL_MEM(A, M, N, READ_ONLY); + CREATE_CL_MEM(x, N, 1, READ_ONLY); + CREATE_CL_MEM(y, M, 1, READ_ONLY); + CLBLAS_CHECK(clblasDgemv(clblasRowMajor, clTransA, M, N, &alpha, + ARRAY(A), ARRAY(x), &beta, ARRAY(y), + CLBALS_TRAILING_ARGS)); } template From bc8da2acad98005a0170722f73a9ab45e157dff9 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 17:50:39 +0800 Subject: [PATCH 40/75] Implement OpenCLDevice::axpy and fix gemm, gemv --- src/caffe/util/opencl_device.cpp | 46 +++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index 458b6a641b3..c6319013528 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -8,9 +8,9 @@ namespace caffe { template <> void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, - const int N, const int K, const Dtype alpha, - const Dtype* A, const Dtype* B, - const Dtype beta, Dtype* C) { + const int N, const int K, const float alpha, + const float* A, const float* B, + const float beta, float* C) { // Note that cublas follows fortran order. 
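// Note (a sketch, not part of the patch): the hunks in this commit replace
// Dtype with float or double inside the template <> specializations added by
// patches 36 to 39; a full specialization has to spell out the concrete type,
// since Dtype only names the parameter of the primary template. A minimal,
// self-contained example of the pattern (Scaler and apply are made-up names
// for illustration):
template <typename T> struct Scaler {
  void apply(const int n, const T alpha, T* x);
};
template <>
void Scaler<float>::apply(const int n, const float alpha, float* x) {
  for (int i = 0; i < n; ++i) x[i] *= alpha;  // concrete float, not T
}
// Two smaller issues in the gemv bodies of patch 39 are worth flagging: they
// never issue PRE_CLBLAS_CALL, so the queue/event names expanded from
// CLBALS_TRAILING_ARGS are undeclared there, and y is created READ_ONLY even
// though gemv writes it; a later hunk in patch 42 switches y to READ_WRITE.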
LEAD_DIM(A, M, K); LEAD_DIM(B, K, N); @@ -37,9 +37,9 @@ void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, template <> void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, - const int N, const int K, const Dtype alpha, - const Dtype* A, const Dtype* B, - const Dtype beta, Dtype* C) { + const int N, const int K, const double alpha, + const double* A, const double* B, + const double beta, double* C) { // Note that cublas follows fortran order. LEAD_DIM(A, M, K); LEAD_DIM(B, K, N); @@ -65,8 +65,8 @@ void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, template <> void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const Dtype alpha, const Dtype* A, - const Dtype* x, const Dtype beta, Dtype* y) { + const int N, const float alpha, const float* A, + const float* x, const float beta, float* y) { clblasTranspose clTransA = to_clblasTranspose(TransA); CREATE_CL_MEM(A, M, N, READ_ONLY); CREATE_CL_MEM(x, N, 1, READ_ONLY); @@ -78,8 +78,8 @@ void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, template <> void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const Dtype alpha, const Dtype* A, - const Dtype* x, const Dtype beta, Dtype* y) { + const int N, const double alpha, const double* A, + const double* x, const double beta, double* y) { clblasTranspose clTransA = to_clblasTranspose(TransA); CREATE_CL_MEM(A, M, N, READ_ONLY); CREATE_CL_MEM(x, N, 1, READ_ONLY); @@ -89,10 +89,26 @@ void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, CLBALS_TRAILING_ARGS)); } -template -void OpenCLDevice::axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y) { - caffe_gpu_axpy(N, alpha, X, Y); +template <> +void OpenCLDevice::axpy(const int N, const float alpha, + const float* X, float* Y) { + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CUBLAS_CHECK(clblasSaxpy( + N, &alpha, ARRAY(X), ARRAY(Y), + CLBALS_TRAILING_ARGS)); +} + +template <> +void OpenCLDevice::axpy(const int N, const double alpha, + const double* X, double* Y) { + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CUBLAS_CHECK(clblasDaxpy( + N, &alpha, ARRAY(X), ARRAY(Y), + CLBALS_TRAILING_ARGS)); } template @@ -242,8 +258,6 @@ void OpenCLDevice::col2im(const Dtype* data_col, const int channels, data_im); } -INSTANTIATE_CLASS(OpenCLDevice); - const char* clGetErrorString(cl_int error) { switch (error) { case CL_SUCCESS: From c67b937d51120077a04985a54434670d8b7521f5 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 19:02:00 +0800 Subject: [PATCH 41/75] Implement OpenCLDevice::axpby --- src/caffe/util/opencl_device.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index c6319013528..ebe0e6c2681 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -111,10 +111,20 @@ void OpenCLDevice::axpy(const int N, const double alpha, CLBALS_TRAILING_ARGS)); } -template -void OpenCLDevice::axpby(const int N, const Dtype alpha, - const Dtype* X, const Dtype beta, Dtype* Y) { - caffe_gpu_axpby(N, alpha, X, beta, Y); +template <> +void OpenCLDevice::axpby( + const int N, const float alpha, const float* X, + const float beta, float* Y) { + this->scal(N, beta, Y); + this->axpy(N, alpha, X, Y); +} + +template <> +void OpenCLDevice::axpby( + const int N, const double alpha, const double* X, + 
const double beta, double* Y) { + this->scal(N, beta, Y); + this->axpy(N, alpha, X, Y); } template From 10f48f0d772aa320cb68b09e0a73d3ada204ea95 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 19:16:21 +0800 Subject: [PATCH 42/75] Implement OpenCLDevice<>::copy --- src/caffe/util/opencl_device.cpp | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index ebe0e6c2681..c46cb164336 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -70,7 +70,7 @@ void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, clblasTranspose clTransA = to_clblasTranspose(TransA); CREATE_CL_MEM(A, M, N, READ_ONLY); CREATE_CL_MEM(x, N, 1, READ_ONLY); - CREATE_CL_MEM(y, M, 1, READ_ONLY); + CREATE_CL_MEM(y, M, 1, READ_WRITE); CLBLAS_CHECK(clblasSgemv(clblasRowMajor, clTransA, M, N, &alpha, ARRAY(A), ARRAY(x), &beta, ARRAY(y), CLBALS_TRAILING_ARGS)); @@ -83,7 +83,7 @@ void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, clblasTranspose clTransA = to_clblasTranspose(TransA); CREATE_CL_MEM(A, M, N, READ_ONLY); CREATE_CL_MEM(x, N, 1, READ_ONLY); - CREATE_CL_MEM(y, M, 1, READ_ONLY); + CREATE_CL_MEM(y, M, 1, READ_WRITE); CLBLAS_CHECK(clblasDgemv(clblasRowMajor, clTransA, M, N, &alpha, ARRAY(A), ARRAY(x), &beta, ARRAY(y), CLBALS_TRAILING_ARGS)); @@ -127,9 +127,22 @@ void OpenCLDevice::axpby( this->axpy(N, alpha, X, Y); } -template -void OpenCLDevice::copy(const int N, const Dtype *X, Dtype *Y) { - caffe_gpu_copy(N, X, Y); +template <> +void OpenCLDevice::copy(const int N, const float *X, float *Y) { + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + CLBLAS_CHECK(clblasScopy( + N, ARRAY(X), ARRAY(Y), + CLBALS_TRAILING_ARGS)); +} + +template <> +void OpenCLDevice::copy(const int N, const double *X, double *Y) { + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + CLBLAS_CHECK(clblasScopy( + N, ARRAY(X), ARRAY(Y), + CLBALS_TRAILING_ARGS)); } template From d0aecd466c0fe0abb20282f92115532baab54500 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 19:26:24 +0800 Subject: [PATCH 43/75] Implement OpenCLDevice::copy_from_cpu with clEnqueueWriteBuffer --- include/caffe/util/opencl_device.hpp | 10 +++++----- src/caffe/util/opencl_device.cpp | 20 ++++++++++++++++++-- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp index 659241643a9..66ba4c4d7a3 100644 --- a/include/caffe/util/opencl_device.hpp +++ b/include/caffe/util/opencl_device.hpp @@ -47,16 +47,16 @@ namespace caffe { A, 0, NULL, NULL)); #define PRE_CLBLAS_CALL \ - cl_uint numCommandQueues = 1; \ - cl_uint numEventsInWaitList = 0; \ - cl_event *eventWaitList = NULL; \ + cl_uint num_command_queues = 1; \ + cl_uint num_events_in_wait_list = 0; \ + cl_event *event_wait_list = NULL; \ cl_event events = NULL #define ARRAY(A) buf##A, 0, ld##A #define CLBALS_TRAILING_ARGS \ - numCommandQueues, Caffe::opencl_queue(), numEventsInWaitList, \ - eventWaitList, &events + num_command_queues, Caffe::opencl_queue(), num_events_in_wait_list, \ + event_wait_list, &events const char* clGetErrorString(cl_int error); const char* clblasGetErrorString(clblasStatus_t status); diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index c46cb164336..cd2b74f7f1a 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -131,6 
+131,7 @@ template <> void OpenCLDevice::copy(const int N, const float *X, float *Y) { CREATE_CL_MEM(X, N, 1, READ_ONLY); CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; CLBLAS_CHECK(clblasScopy( N, ARRAY(X), ARRAY(Y), CLBALS_TRAILING_ARGS)); @@ -140,6 +141,7 @@ template <> void OpenCLDevice::copy(const int N, const double *X, double *Y) { CREATE_CL_MEM(X, N, 1, READ_ONLY); CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; CLBLAS_CHECK(clblasScopy( N, ARRAY(X), ARRAY(Y), CLBALS_TRAILING_ARGS)); @@ -147,8 +149,22 @@ void OpenCLDevice::copy(const int N, const double *X, double *Y) { template void OpenCLDevice::copy_from_cpu(const int N, const Dtype *X, Dtype *Y) { - CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyHostToDevice)); -} + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + cl_bool blocking_write = CL_TRUE; + cl_uint num_events_in_wait_list = 0; + cl_event *event_wait_list = NULL; + cl_event events = NULL; + CL_CHECK(clEnqueueWriteBuffer( + Caffe::opencl_queue(), bufY, blocking_write, 0, N * sizeof(Dtype), + X, num_events_in_wait_list, event_wait_list, &events)); +} + +template +void OpenCLDevice::copy_from_cpu(const int N, const float *X, + float *Y); +template +void OpenCLDevice::copy_from_cpu(const int N, const double *X, + double *Y); template void OpenCLDevice::set(const int N, const Dtype alpha, Dtype *X) { From 90942970af1263200f35d9402bbaa743e10ebbae Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 19:36:34 +0800 Subject: [PATCH 44/75] Implement OpenCLDevice::set with clEnqueueFillBuffer --- src/caffe/util/opencl_device.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index cd2b74f7f1a..9c60cc7f2fe 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -147,6 +147,9 @@ void OpenCLDevice::copy(const int N, const double *X, double *Y) { CLBALS_TRAILING_ARGS)); } +/** + * http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clEnqueueWriteBuffer.html + */ template void OpenCLDevice::copy_from_cpu(const int N, const Dtype *X, Dtype *Y) { CREATE_CL_MEM(Y, N, 1, READ_WRITE); @@ -166,11 +169,25 @@ template void OpenCLDevice::copy_from_cpu(const int N, const double *X, double *Y); +/** + * http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clEnqueueFillBuffer.html + */ template void OpenCLDevice::set(const int N, const Dtype alpha, Dtype *X) { - caffe_gpu_set(N, alpha, X); + CREATE_CL_MEM(X, N, 1, READ_WRITE); + cl_uint num_events_in_wait_list = 0; + cl_event *event_wait_list = NULL; + cl_event events = NULL; + CL_CHECK(clEnqueueFillBuffer( + Caffe::opencl_queue(), bufA, &alpha, sizeof(Dtype), 0, + sizeof(Dtype) * N, num_events_in_wait_list, event_wait_list, &event)); } +template +void OpenCLDevice::set(const int N, const float alpha, float *X); +template +void OpenCLDevice::set(const int N, const double alpha, double *X); + template void OpenCLDevice::add_scalar(const int N, const Dtype alpha, Dtype *X) { From 95371e2cdaf2db6e2af0d8b4be74d498ce8465ce Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 20:05:38 +0800 Subject: [PATCH 45/75] Implement OpenCLDevice::scale with copy and scal --- src/caffe/util/opencl_device.cpp | 82 +++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 23 deletions(-) diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index 9c60cc7f2fe..b3517bcd75e 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp 
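// Note (a sketch, not part of the patches above): the double-precision copy
// added in patch 42 still calls clblasScopy; the hunk that follows switches it
// to clblasDcopy. More broadly, these wrappers create a fresh cl_mem from each
// raw host pointer and never read results back, so the caller's output is not
// actually updated; presumably a later change in the series is meant to hand
// the wrappers persistent device buffers, which is not shown here. For
// reference, the blocking read-back that would complete copy_from_cpu's round
// trip looks roughly like this (read_back_sketch is an illustrative name):
template <typename Dtype>
static void read_back_sketch(const int N, const cl_mem bufY, Dtype* Y) {
  CL_CHECK(clEnqueueReadBuffer(Caffe::opencl_queue(), bufY, CL_TRUE /* block */,
      0, N * sizeof(Dtype), Y, 0, NULL, NULL));
}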
@@ -142,7 +142,7 @@ void OpenCLDevice::copy(const int N, const double *X, double *Y) { CREATE_CL_MEM(X, N, 1, READ_ONLY); CREATE_CL_MEM(Y, N, 1, READ_WRITE); PRE_CLBLAS_CALL; - CLBLAS_CHECK(clblasScopy( + CLBLAS_CHECK(clblasDcopy( N, ARRAY(X), ARRAY(Y), CLBALS_TRAILING_ARGS)); } @@ -191,12 +191,25 @@ void OpenCLDevice::set(const int N, const double alpha, double *X); template void OpenCLDevice::add_scalar(const int N, const Dtype alpha, Dtype *X) { - caffe_gpu_add_scalar(N, alpha, X); + NOT_IMPLEMENTED; } -template -void OpenCLDevice::scal(const int N, const Dtype alpha, Dtype *X) { - caffe_gpu_scal(N, alpha, X); +template <> +void OpenCLDevice::scal(const int N, const float alpha, float *X) { + CREATE_CL_MEM(X, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasSscal( + N, alpha, ARRAY(X), + CLBALS_TRAILING_ARGS)); +} + +template <> +void OpenCLDevice::scal(const int N, const double alpha, double *X) { + CREATE_CL_MEM(X, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasDscal( + N, alpha, ARRAY(X), + CLBALS_TRAILING_ARGS)); } template @@ -208,43 +221,50 @@ void OpenCLDevice::sqr(const int N, const Dtype* a, Dtype* y) { template void OpenCLDevice::add(const int N, const Dtype* a, const Dtype* b, Dtype* y) { - caffe_gpu_add(N, a, b, y); + NOT_IMPLEMENTED; +// caffe_gpu_add(N, a, b, y); } template void OpenCLDevice::sub(const int N, const Dtype* a, const Dtype* b, Dtype* y) { - caffe_gpu_sub(N, a, b, y); + NOT_IMPLEMENTED; +// caffe_gpu_sub(N, a, b, y); } template void OpenCLDevice::mul(const int N, const Dtype* a, const Dtype* b, Dtype* y) { - caffe_gpu_mul(N, a, b, y); + NOT_IMPLEMENTED; +// caffe_gpu_mul(N, a, b, y); } template void OpenCLDevice::div(const int N, const Dtype* a, const Dtype* b, Dtype* y) { - caffe_gpu_div(N, a, b, y); + NOT_IMPLEMENTED; +// caffe_gpu_div(N, a, b, y); } template void OpenCLDevice::powx(const int N, const Dtype* a, const Dtype b, Dtype* y) { - caffe_gpu_powx(N, a, b, y); + NOT_IMPLEMENTED; +// caffe_gpu_powx(N, a, b, y); } template void OpenCLDevice::rng_uniform(const int N, const Dtype a, const Dtype b, Dtype* r) { - caffe_gpu_rng_uniform(N, a, b, r); + NOT_IMPLEMENTED; +// caffe_gpu_rng_uniform(N, a, b, r); } template void OpenCLDevice::rng_gaussian(const int N, const Dtype mu, const Dtype sigma, Dtype* r) { - caffe_gpu_rng_gaussian(N, mu, sigma, r); + NOT_IMPLEMENTED; +// caffe_gpu_rng_gaussian(N, mu, sigma, r); } template @@ -262,56 +282,72 @@ void OpenCLDevice::exp(const int N, const Dtype* a, Dtype* y) { template void OpenCLDevice::dot(const int N, const Dtype* x, const Dtype* y, Dtype* out) { - caffe_gpu_dot(N, x, y, out); + NOT_IMPLEMENTED; +// caffe_gpu_dot(N, x, y, out); } template void OpenCLDevice::hamming_distance(const int N, const Dtype* x, const Dtype* y, uint32_t* out) { - *out = caffe_gpu_hamming_distance(N, x, y); + NOT_IMPLEMENTED; +// *out = caffe_gpu_hamming_distance(N, x, y); } template // Returns the sum of the absolute values of the elements of vector x void OpenCLDevice::asum(const int N, const Dtype* x, Dtype* y) { - caffe_gpu_asum(N, x, y); + NOT_IMPLEMENTED; +// caffe_gpu_asum(N, x, y); } template void OpenCLDevice::sign(const int N, const Dtype* x, Dtype* y) { - caffe_gpu_sign(N, x, y); + NOT_IMPLEMENTED; +// caffe_gpu_sign(N, x, y); } template void OpenCLDevice::sgnbit(const int N, const Dtype* x, Dtype* y) { - caffe_gpu_sgnbit(N, x, y); + NOT_IMPLEMENTED; +// caffe_gpu_sgnbit(N, x, y); } template void OpenCLDevice::fabs(const int N, const Dtype* x, Dtype* y) { - caffe_gpu_fabs(N, x, y); + NOT_IMPLEMENTED; +// 
caffe_gpu_fabs(N, x, y); } template void OpenCLDevice::scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y) { - caffe_gpu_scale(N, alpha, x, y); + this->copy(N, x, y); + this->scal(N, alpha, y); } +template +void OpenCLDevice::scale(const int N, const float alpha, + const float *x, float* y); +template +void OpenCLDevice::scale(const int N, const double alpha, + const double *x, double* y); + template void OpenCLDevice::im2col(const Dtype* data_im, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, Dtype* data_col) { - im2col_gpu(data_im, channels, height, width, ksize, pad, stride, - data_col); + NOT_IMPLEMENTED; +// im2col_gpu(data_im, channels, height, width, ksize, pad, stride, +// data_col); } template void OpenCLDevice::col2im(const Dtype* data_col, const int channels, const int height, const int width, const int psize, const int pad, const int stride, Dtype* data_im) { - col2im_gpu(data_col, channels, height, width, psize, pad, stride, - data_im); + NOT_IMPLEMENTED; +// col2im_gpu(data_col, channels, height, width, psize, pad, stride, +// data_im); } const char* clGetErrorString(cl_int error) { From 1c996f40eb82c5721a5824ed88ba3a21e6b8f799 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 20:45:53 +0800 Subject: [PATCH 46/75] Add OPENCL_KERNEL_LOOP and DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC --- include/caffe/util/opencl_device.hpp | 29 ++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp index 66ba4c4d7a3..25ea172f114 100644 --- a/include/caffe/util/opencl_device.hpp +++ b/include/caffe/util/opencl_device.hpp @@ -74,6 +74,35 @@ inline clblasTranspose to_clblasTranspose(const CBLAS_TRANSPOSE trans) { } } +// OpenCL: grid stride looping +#define OPENCL_KERNEL_LOOP(i, n) \ + for (int i = get_global_id(0); \ + i < (n); \ + i += get_global_size(0)) + +#define DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(name, operation) \ +template \ +__kernel void name##_kernel(__globalconst int n, __global const Dtype* a, \ + __globalconst Dtype* b, __global Dtype* y) { \ + OPENCL_KERNEL_LOOP(index, n) { \ + operation; \ + } \ +} \ +template <> \ +void caffe_opencl_##name(const int N, const float* a, \ + const float* b, float* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + N, a, b, y); \ +} \ +template <> \ +void caffe_opencl_##name(const int N, const double* a, \ + const double* b, double* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + N, a, b, y); \ +} + template class OpenCLDevice : public Device { public: From c0b3f7f43193c54c5965baffd34f323d2650800f Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 20:54:11 +0800 Subject: [PATCH 47/75] Declare, define and instantiate caffe_opencl_{add, sub, mul, div} --- include/caffe/util/opencl_device.hpp | 18 +++++- src/caffe/util/opencl_device.cpp | 82 +++++++++++++++++++--------- 2 files changed, 72 insertions(+), 28 deletions(-) diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp index 25ea172f114..ff033dd73e9 100644 --- a/include/caffe/util/opencl_device.hpp +++ b/include/caffe/util/opencl_device.hpp @@ -84,7 +84,7 @@ inline clblasTranspose to_clblasTranspose(const CBLAS_TRANSPOSE trans) { template \ __kernel void name##_kernel(__globalconst int n, __global const Dtype* a, \ __globalconst Dtype* b, __global Dtype* y) { \ - OPENCL_KERNEL_LOOP(index, n) { \ + 
OPENCL_KERNEL_LOOP(i, n) { \ operation; \ } \ } \ @@ -103,6 +103,22 @@ void caffe_opencl_##name(const int N, const double* a, \ N, a, b, y); \ } +template +void caffe_opencl_add(const int N, const Dtype* a, + const Dtype* b, Dtype* y); + +template +void caffe_opencl_sub(const int N, const Dtype* a, + const Dtype* b, Dtype* y); + +template +void caffe_opencl_mul(const int N, const Dtype* a, + const Dtype* b, Dtype* y); + +template +void caffe_opencl_div(const int N, const Dtype* a, + const Dtype* b, Dtype* y); + template class OpenCLDevice : public Device { public: diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index b3517bcd75e..4e2a722392b 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -5,12 +5,17 @@ namespace caffe { +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(add, y[i] = a[i] + b[i]); +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(sub, y[i] = a[i] - b[i]); +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(mul, y[i] = a[i] * b[i]); +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(div, y[i] = a[i] / b[i]); + template <> void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, - const int N, const int K, const float alpha, - const float* A, const float* B, - const float beta, float* C) { + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const float alpha, + const float* A, const float* B, + const float beta, float* C) { // Note that cublas follows fortran order. LEAD_DIM(A, M, K); LEAD_DIM(B, K, N); @@ -36,10 +41,10 @@ void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, template <> void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, - const int N, const int K, const double alpha, - const double* A, const double* B, - const double beta, double* C) { + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const double alpha, + const double* A, const double* B, + const double beta, double* C) { // Note that cublas follows fortran order. 
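// Note (a sketch, not part of the patch): DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC
// mixes device-side OpenCL constructs (__kernel, __global, get_global_id) with a
// CUDA-style chevron launch and C++ templates, none of which compile as host C++
// or as plain OpenCL C. In OpenCL the kernel lives in a source string, is built
// with clBuildProgram, and is launched with clSetKernelArg plus
// clEnqueueNDRangeKernel, one kernel per element type. A minimal host-side
// launch for an element-wise add (launch_add_float, kernel and bufA/bufB/bufY
// are illustrative; the kernel string would contain the grid-stride loop shown
// in OPENCL_KERNEL_LOOP):
static void launch_add_float(cl_kernel kernel, const int n,
                             cl_mem bufA, cl_mem bufB, cl_mem bufY) {
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &n));
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufA));
  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufB));
  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufY));
  const size_t global_work_size = 256 * 64;  // grid-stride loop covers any n
  CL_CHECK(clEnqueueNDRangeKernel(Caffe::opencl_queue(), kernel, 1, NULL,
      &global_work_size, NULL, 0, NULL, NULL));
}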
LEAD_DIM(A, M, K); LEAD_DIM(B, K, N); @@ -65,8 +70,8 @@ void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, template <> void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, - const float* x, const float beta, float* y) { + const int N, const float alpha, const float* A, + const float* x, const float beta, float* y) { clblasTranspose clTransA = to_clblasTranspose(TransA); CREATE_CL_MEM(A, M, N, READ_ONLY); CREATE_CL_MEM(x, N, 1, READ_ONLY); @@ -77,9 +82,10 @@ void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, } template <> -void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, - const double* x, const double beta, double* y) { +void OpenCLDevice::gemv( + const CBLAS_TRANSPOSE TransA, const int M, + const int N, const double alpha, const double* A, + const double* x, const double beta, double* y) { clblasTranspose clTransA = to_clblasTranspose(TransA); CREATE_CL_MEM(A, M, N, READ_ONLY); CREATE_CL_MEM(x, N, 1, READ_ONLY); @@ -151,7 +157,8 @@ void OpenCLDevice::copy(const int N, const double *X, double *Y) { * http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clEnqueueWriteBuffer.html */ template -void OpenCLDevice::copy_from_cpu(const int N, const Dtype *X, Dtype *Y) { +void OpenCLDevice::copy_from_cpu(const int N, const Dtype *X, + Dtype *Y) { CREATE_CL_MEM(Y, N, 1, READ_WRITE); cl_bool blocking_write = CL_TRUE; cl_uint num_events_in_wait_list = 0; @@ -190,7 +197,7 @@ void OpenCLDevice::set(const int N, const double alpha, double *X); template void OpenCLDevice::add_scalar(const int N, const Dtype alpha, - Dtype *X) { + Dtype *X) { NOT_IMPLEMENTED; } @@ -220,49 +227,50 @@ void OpenCLDevice::sqr(const int N, const Dtype* a, Dtype* y) { template void OpenCLDevice::add(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { + Dtype* y) { + NOT_IMPLEMENTED; // caffe_gpu_add(N, a, b, y); } template void OpenCLDevice::sub(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { + Dtype* y) { NOT_IMPLEMENTED; // caffe_gpu_sub(N, a, b, y); } template void OpenCLDevice::mul(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { + Dtype* y) { NOT_IMPLEMENTED; // caffe_gpu_mul(N, a, b, y); } template void OpenCLDevice::div(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { + Dtype* y) { NOT_IMPLEMENTED; // caffe_gpu_div(N, a, b, y); } template void OpenCLDevice::powx(const int N, const Dtype* a, const Dtype b, - Dtype* y) { + Dtype* y) { NOT_IMPLEMENTED; // caffe_gpu_powx(N, a, b, y); } template void OpenCLDevice::rng_uniform(const int N, const Dtype a, - const Dtype b, Dtype* r) { + const Dtype b, Dtype* r) { NOT_IMPLEMENTED; // caffe_gpu_rng_uniform(N, a, b, r); } template void OpenCLDevice::rng_gaussian(const int N, const Dtype mu, - const Dtype sigma, Dtype* r) { + const Dtype sigma, Dtype* r) { NOT_IMPLEMENTED; // caffe_gpu_rng_gaussian(N, mu, sigma, r); } @@ -281,23 +289,43 @@ void OpenCLDevice::exp(const int N, const Dtype* a, Dtype* y) { template void OpenCLDevice::dot(const int N, const Dtype* x, const Dtype* y, - Dtype* out) { + Dtype* out) { NOT_IMPLEMENTED; // caffe_gpu_dot(N, x, y, out); } template void OpenCLDevice::hamming_distance(const int N, const Dtype* x, - const Dtype* y, uint32_t* out) { + const Dtype* y, uint32_t* out) { NOT_IMPLEMENTED; // *out = caffe_gpu_hamming_distance(N, x, y); } +/** + * +clblasSasum( + size_t N, + cl_mem asum, + size_t offAsum, + const cl_mem X, + size_t offx, + int incx, + cl_mem 
scratchBuff, + cl_uint numCommandQueues, + cl_command_queue *commandQueues, + cl_uint numEventsInWaitList, + const cl_event *eventWaitList, + cl_event *events) + */ template -// Returns the sum of the absolute values of the elements of vector x void OpenCLDevice::asum(const int N, const Dtype* x, Dtype* y) { NOT_IMPLEMENTED; -// caffe_gpu_asum(N, x, y); +// CREATE_CL_MEM(x, N, 1, READ_ONLY); +// CREATE_CL_MEM(y, N, 1, READ_WRITE); +// PRE_CLBLAS_CALL; +// CLBLAS_CHECK(clblasSasum( +// N, alpha, ARRAY(X), +// CLBALS_TRAILING_ARGS)); } template @@ -320,7 +348,7 @@ void OpenCLDevice::fabs(const int N, const Dtype* x, Dtype* y) { template void OpenCLDevice::scale(const int N, const Dtype alpha, - const Dtype *x, Dtype* y) { + const Dtype *x, Dtype* y) { this->copy(N, x, y); this->scal(N, alpha, y); } From 238a712c668a021711b9f0519ad1e89a3f55f703 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 20:56:35 +0800 Subject: [PATCH 48/75] Use caffe_opencl_{add,sub,mul,div} in OpenCLDevice::{add,sub,mul,div} --- src/caffe/util/opencl_device.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index 4e2a722392b..4a81c834b8e 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -228,30 +228,25 @@ void OpenCLDevice::sqr(const int N, const Dtype* a, Dtype* y) { template void OpenCLDevice::add(const int N, const Dtype* a, const Dtype* b, Dtype* y) { - - NOT_IMPLEMENTED; -// caffe_gpu_add(N, a, b, y); + caffe_opencl_add(N, a, b, y); } template void OpenCLDevice::sub(const int N, const Dtype* a, const Dtype* b, Dtype* y) { - NOT_IMPLEMENTED; -// caffe_gpu_sub(N, a, b, y); + caffe_opencl_sub(N, a, b, y); } template void OpenCLDevice::mul(const int N, const Dtype* a, const Dtype* b, Dtype* y) { - NOT_IMPLEMENTED; -// caffe_gpu_mul(N, a, b, y); + caffe_opencl_mul(N, a, b, y); } template void OpenCLDevice::div(const int N, const Dtype* a, const Dtype* b, Dtype* y) { - NOT_IMPLEMENTED; -// caffe_gpu_div(N, a, b, y); + caffe_opencl_div(N, a, b, y); } template From a4bf96b799e22c559f3c4a7dcb758fb47b2e99ea Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 21:15:46 +0800 Subject: [PATCH 49/75] Add the macro DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC --- include/caffe/util/opencl_device.hpp | 36 ++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp index ff033dd73e9..fbe485c13c5 100644 --- a/include/caffe/util/opencl_device.hpp +++ b/include/caffe/util/opencl_device.hpp @@ -80,24 +80,46 @@ inline clblasTranspose to_clblasTranspose(const CBLAS_TRANSPOSE trans) { i < (n); \ i += get_global_size(0)) +#define DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(name, operation) \ +template \ +__kernel void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ + OPENCL_KERNEL_LOOP(index, n) { \ + operation; \ + } \ +} \ +template <> \ +void caffe_opencl_##name(const int n, const float* x, float* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + n, x, y); \ +} \ +template <> \ +void caffe_opencl_##name(const int n, const double* x, double* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + n, x, y); \ +} + #define DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(name, operation) \ template \ -__kernel void name##_kernel(__globalconst int n, __global const Dtype* a, \ - __globalconst Dtype* b, __global Dtype* y) { \ 
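// Note (a sketch, not part of the patch): following the clblasSasum parameter
// list quoted above, asum needs a one-element device buffer for the result, a
// scratch buffer (at least N elements, per my reading of the clBLAS docs, so
// worth double-checking), and a read-back of the scalar into the host output.
// opencl_asum_sketch is an illustrative name; CL_CHECK, PRE_CLBLAS_CALL and
// CLBALS_TRAILING_ARGS are the helpers defined earlier in this series.
static void opencl_asum_sketch(const int N, const cl_mem bufx, float* y) {
  cl_int err;
  cl_mem buf_sum = clCreateBuffer(Caffe::opencl_context(), CL_MEM_READ_WRITE,
      sizeof(float), NULL, &err);
  CL_CHECK(err);
  cl_mem scratch = clCreateBuffer(Caffe::opencl_context(), CL_MEM_READ_WRITE,
      N * sizeof(float), NULL, &err);
  CL_CHECK(err);
  PRE_CLBLAS_CALL;
  CLBLAS_CHECK(clblasSasum(N, buf_sum, 0, bufx, 0, 1, scratch,
      CLBALS_TRAILING_ARGS));
  CL_CHECK(clEnqueueReadBuffer(Caffe::opencl_queue(), buf_sum, CL_TRUE, 0,
      sizeof(float), y, 0, NULL, NULL));
  clReleaseMemObject(scratch);
  clReleaseMemObject(buf_sum);
}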
+__kernel void name##_kernel(__global const int n, __global const Dtype* a, \ + __global const Dtype* b, __global Dtype* y) { \ OPENCL_KERNEL_LOOP(i, n) { \ - operation; \ + operation; \ } \ } \ template <> \ -void caffe_opencl_##name(const int N, const float* a, \ - const float* b, float* y) { \ +void caffe_opencl_##name( \ + __global const int N, __global const float* a, \ + __global const float* b, __global float* y) { \ /* NOLINT_NEXT_LINE(whitespace/operators) */ \ name##_kernel<<>>( \ N, a, b, y); \ } \ template <> \ -void caffe_opencl_##name(const int N, const double* a, \ - const double* b, double* y) { \ +void caffe_opencl_##name( \ + __global const int N, __global const double* a, \ + __global const double* b, __global double* y) { \ /* NOLINT_NEXT_LINE(whitespace/operators) */ \ name##_kernel<<>>( \ N, a, b, y); \ From cf8e21e997b6b43eb2859b7844605e1ac130371d Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 21:30:01 +0800 Subject: [PATCH 50/75] Define and instantiate caffe_opencl_{sqr, exp, sign, sgnbit, fabs} --- include/caffe/util/opencl_device.hpp | 15 +++++++++++++++ src/caffe/util/opencl_device.cpp | 6 ++++++ 2 files changed, 21 insertions(+) diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp index fbe485c13c5..3e7cf563b2c 100644 --- a/include/caffe/util/opencl_device.hpp +++ b/include/caffe/util/opencl_device.hpp @@ -125,6 +125,21 @@ void caffe_opencl_##name( \ N, a, b, y); \ } +template +void caffe_opencl_sqr(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_exp(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_sign(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_sgnbit(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_fabs(const int n, const Dtype* x, Dtype* y); + template void caffe_opencl_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index 4a81c834b8e..7c547e55aa0 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -5,6 +5,12 @@ namespace caffe { +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sqr, y[i] = x[i] * x[i]); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(exp, y[i] = exp(x[i])); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sign, y[i] = sign(x[i])); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sgnbit, y[i] = signbit(x[i])); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(fabs, y[i] = fabs(x[i])); + DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(add, y[i] = a[i] + b[i]); DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(sub, y[i] = a[i] - b[i]); DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(mul, y[i] = a[i] * b[i]); From a245eccd9bbcee19d35e82f4d3a4b93a850686db Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 21:33:37 +0800 Subject: [PATCH 51/75] Use caffe_opencl_{sqr,exp,sign,sgnbit,fabs} in OpenCLDevice::{...} --- src/caffe/util/opencl_device.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index 7c547e55aa0..0b591cda484 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -227,8 +227,7 @@ void OpenCLDevice::scal(const int N, const double alpha, double *X) { template void OpenCLDevice::sqr(const int N, const Dtype* a, Dtype* y) { - NOT_IMPLEMENTED; -// caffe_gpu_sqr(N, a, y); + caffe_opencl_sqr(N, a, y); } template @@ -284,8 +283,7 @@ void OpenCLDevice::rng_bernoulli(const int N, const 
Dtype p, int* r) { template void OpenCLDevice::exp(const int N, const Dtype* a, Dtype* y) { - NOT_IMPLEMENTED; -// caffe_gpu_exp(N, a, y); + caffe_opencl_exp(N, a, y); } template @@ -331,20 +329,17 @@ void OpenCLDevice::asum(const int N, const Dtype* x, Dtype* y) { template void OpenCLDevice::sign(const int N, const Dtype* x, Dtype* y) { - NOT_IMPLEMENTED; -// caffe_gpu_sign(N, x, y); + caffe_opencl_sign(N, x, y); } template void OpenCLDevice::sgnbit(const int N, const Dtype* x, Dtype* y) { - NOT_IMPLEMENTED; -// caffe_gpu_sgnbit(N, x, y); + caffe_opencl_sgnbit(N, x, y); } template void OpenCLDevice::fabs(const int N, const Dtype* x, Dtype* y) { - NOT_IMPLEMENTED; -// caffe_gpu_fabs(N, x, y); + caffe_opencl_fabs(N, x, y); } template From 3cb16cc1fb127907dc5b20971dc9174df67685c1 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 21:43:49 +0800 Subject: [PATCH 52/75] Move the definitions of OpenCLDevice unary & binary methods into macros --- include/caffe/util/opencl_device.hpp | 25 ++++++++++++-- src/caffe/util/opencl_device.cpp | 49 ---------------------------- 2 files changed, 23 insertions(+), 51 deletions(-) diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp index 3e7cf563b2c..a35bd9ef3f5 100644 --- a/include/caffe/util/opencl_device.hpp +++ b/include/caffe/util/opencl_device.hpp @@ -98,7 +98,16 @@ void caffe_opencl_##name(const int n, const double* x, double* y) { \ /* NOLINT_NEXT_LINE(whitespace/operators) */ \ name##_kernel<<>>( \ n, x, y); \ -} +} \ +template \ +void OpenCLDevice::name(const int N, const Dtype* x, Dtype* y) { \ + caffe_opencl_##name(N, x, y); \ +} \ +template \ +void OpenCLDevice::name(const int N, const float* x, float* y); \ +template \ +void OpenCLDevice::name(const int N, const double* x, double* y); + #define DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(name, operation) \ template \ @@ -123,7 +132,19 @@ void caffe_opencl_##name( \ /* NOLINT_NEXT_LINE(whitespace/operators) */ \ name##_kernel<<>>( \ N, a, b, y); \ -} +} \ +template \ +void OpenCLDevice::name(const int N, const Dtype* a, const Dtype* b, \ + Dtype* y) { \ + caffe_opencl_##name(N, x, y); \ +} \ +template \ +void OpenCLDevice::name(const int N, const float* a, const float* b, \ + float* y); \ +template \ +void OpenCLDevice::name(const int N, const double* a, \ + const double* b, double* y); + template void caffe_opencl_sqr(const int n, const Dtype* x, Dtype* y); diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index 0b591cda484..d164241e32d 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -225,35 +225,6 @@ void OpenCLDevice::scal(const int N, const double alpha, double *X) { CLBALS_TRAILING_ARGS)); } -template -void OpenCLDevice::sqr(const int N, const Dtype* a, Dtype* y) { - caffe_opencl_sqr(N, a, y); -} - -template -void OpenCLDevice::add(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { - caffe_opencl_add(N, a, b, y); -} - -template -void OpenCLDevice::sub(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { - caffe_opencl_sub(N, a, b, y); -} - -template -void OpenCLDevice::mul(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { - caffe_opencl_mul(N, a, b, y); -} - -template -void OpenCLDevice::div(const int N, const Dtype* a, const Dtype* b, - Dtype* y) { - caffe_opencl_div(N, a, b, y); -} - template void OpenCLDevice::powx(const int N, const Dtype* a, const Dtype b, Dtype* y) { @@ -281,11 +252,6 @@ void OpenCLDevice::rng_bernoulli(const int N, const Dtype p, 
int* r) { // caffe_gpu_rng_bernoulli(N, p, r); } -template -void OpenCLDevice::exp(const int N, const Dtype* a, Dtype* y) { - caffe_opencl_exp(N, a, y); -} - template void OpenCLDevice::dot(const int N, const Dtype* x, const Dtype* y, Dtype* out) { @@ -327,21 +293,6 @@ void OpenCLDevice::asum(const int N, const Dtype* x, Dtype* y) { // CLBALS_TRAILING_ARGS)); } -template -void OpenCLDevice::sign(const int N, const Dtype* x, Dtype* y) { - caffe_opencl_sign(N, x, y); -} - -template -void OpenCLDevice::sgnbit(const int N, const Dtype* x, Dtype* y) { - caffe_opencl_sgnbit(N, x, y); -} - -template -void OpenCLDevice::fabs(const int N, const Dtype* x, Dtype* y) { - caffe_opencl_fabs(N, x, y); -} - template void OpenCLDevice::scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y) { From 65342d8db7f3686736e877267ec9f397a85347bd Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 22:00:03 +0800 Subject: [PATCH 53/75] Replace Caffe::opencl_queue with OpenCLDevice::cl_command_queue --- include/caffe/util/opencl_device.hpp | 6 +++++ src/caffe/util/opencl_device.cpp | 34 ++++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp index a35bd9ef3f5..fab89321722 100644 --- a/include/caffe/util/opencl_device.hpp +++ b/include/caffe/util/opencl_device.hpp @@ -184,6 +184,7 @@ class OpenCLDevice : public Device { } virtual ~OpenCLDevice() { } + virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, Dtype* C); @@ -250,6 +251,11 @@ class OpenCLDevice : public Device { virtual void col2im(const Dtype* data_col, const int channels, const int height, const int width, const int psize, const int pad, const int stride, Dtype* data_im); + + inline static cl_command_queue queue(); + private: + static cl_command_queue cl_command_queue_; + static bool cl_command_queue_created_; }; diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index d164241e32d..b4299e01784 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -5,6 +5,36 @@ namespace caffe { +template +OpenCLDevice::cl_command_queue_created_ = false; + +/** + * http://opencl.codeplex.com/wikipage?title=OpenCL%20Tutorials%20-%201 + */ +template +OpenCLDevice::cl_command_queue queue() { + if (cl_command_queue_created_) { + return cl_command_queue_; + } else { + cl_int error = 0; // Used to handle error codes + cl_platform_id platform; + cl_context context; + cl_command_queue queue; + cl_device_id device; + // Platform + CL_CHECK(oclGetPlatformID(&platform)); + // Device + CL_CHECK(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL)); + // Context + context = clCreateContext(0, 1, &device, NULL, NULL, &error); + CL_CHECK(error); + // Command-queue + queue = clCreateCommandQueue(context, device, 0, &error); + CL_CHECK(error); + cl_command_queue_created_ = true; + } +} + DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sqr, y[i] = x[i] * x[i]); DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(exp, y[i] = exp(x[i])); DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sign, y[i] = sign(x[i])); @@ -171,7 +201,7 @@ void OpenCLDevice::copy_from_cpu(const int N, const Dtype *X, cl_event *event_wait_list = NULL; cl_event events = NULL; CL_CHECK(clEnqueueWriteBuffer( - Caffe::opencl_queue(), bufY, blocking_write, 0, N * sizeof(Dtype), + OpenCLDevice::queue(), bufY, 
blocking_write, 0, N * sizeof(Dtype), X, num_events_in_wait_list, event_wait_list, &events)); } @@ -192,7 +222,7 @@ void OpenCLDevice::set(const int N, const Dtype alpha, Dtype *X) { cl_event *event_wait_list = NULL; cl_event events = NULL; CL_CHECK(clEnqueueFillBuffer( - Caffe::opencl_queue(), bufA, &alpha, sizeof(Dtype), 0, + OpenCLDevice::queue(), bufA, &alpha, sizeof(Dtype), 0, sizeof(Dtype) * N, num_events_in_wait_list, event_wait_list, &event)); } From a07ec1832d107020d62782afc391b80f716e9822 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sun, 8 Jun 2014 22:10:26 +0800 Subject: [PATCH 54/75] Add a new Brew::OPENCL and OpenCL Device in DeviceFactory --- include/caffe/common.hpp | 2 +- include/caffe/util/device.hpp | 1 + src/caffe/util/device.cpp | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index bd4e39f136d..722a7198bc6 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -74,7 +74,7 @@ class Caffe { } return *singleton_; } - enum Brew { CPU, GPU }; + enum Brew { CPU, GPU, OPENCL }; enum Phase { TRAIN, TEST }; diff --git a/include/caffe/util/device.hpp b/include/caffe/util/device.hpp index c7cc21baa66..9b5daf284ce 100644 --- a/include/caffe/util/device.hpp +++ b/include/caffe/util/device.hpp @@ -247,6 +247,7 @@ class DeviceFactory { private: static Device* cpu_device_; static Device* gpu_device_; + static Device* opencl_device_; }; } // namespace caffe diff --git a/src/caffe/util/device.cpp b/src/caffe/util/device.cpp index 4eb59dc6861..7c6a7e811ea 100644 --- a/src/caffe/util/device.cpp +++ b/src/caffe/util/device.cpp @@ -13,6 +13,8 @@ DeviceFactory::GetDevice() { return cpu_device_; case Caffe::GPU: return gpu_device_; + case Caffe::OPENCL: + return opencl_device_; default: LOG(FATAL) << "Unknown caffe mode."; return static_cast*>(NULL); From b0dc9b3c68e969209efdce89a218700e28cc2bb2 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Wed, 18 Jun 2014 11:47:07 +0800 Subject: [PATCH 55/75] Device wrapper methods no longer pure virtual, default not implemented --- include/caffe/util/device.hpp | 70 +++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/include/caffe/util/device.hpp b/include/caffe/util/device.hpp index 9b5daf284ce..ef3e9d5cdbd 100644 --- a/include/caffe/util/device.hpp +++ b/include/caffe/util/device.hpp @@ -21,73 +21,85 @@ class Device { virtual void gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C) = 0; + Dtype* C) { NOT_IMPLEMENTED; } virtual void gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const Dtype alpha, const Dtype* A, const Dtype* x, - const Dtype beta, Dtype* y) = 0; + const Dtype beta, Dtype* y) { NOT_IMPLEMENTED; } virtual void axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y) = 0; + Dtype* Y) { NOT_IMPLEMENTED; } virtual void axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y) = 0; + const Dtype beta, Dtype* Y) { NOT_IMPLEMENTED; } - virtual void copy(const int N, const Dtype *X, Dtype *Y) = 0; - virtual void copy_from_cpu(const int N, const Dtype* X, Dtype* Y) = 0; + virtual void copy(const int N, const Dtype *X, Dtype *Y) { NOT_IMPLEMENTED; } + virtual void copy_from_cpu(const int N, const Dtype* X, Dtype* Y) { + NOT_IMPLEMENTED; } - virtual void set(const int N, const Dtype alpha, Dtype *X) = 0; + virtual void set(const int N, const 
Dtype alpha, Dtype *X) { + NOT_IMPLEMENTED; } - virtual void add_scalar(const int N, const Dtype alpha, Dtype *X) = 0; + virtual void add_scalar(const int N, const Dtype alpha, Dtype *X) { + NOT_IMPLEMENTED; } - virtual void scal(const int N, const Dtype alpha, Dtype *X) = 0; + virtual void scal(const int N, const Dtype alpha, Dtype *X) { + NOT_IMPLEMENTED; } - virtual void sqr(const int N, const Dtype* a, Dtype* y) = 0; + virtual void sqr(const int N, const Dtype* a, Dtype* y) { NOT_IMPLEMENTED; } - virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; + virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y) { + NOT_IMPLEMENTED; } - virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; + virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y) { + NOT_IMPLEMENTED; } - virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; + virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y) { + NOT_IMPLEMENTED; } - virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y) = 0; + virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y) { + NOT_IMPLEMENTED; } - virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y) = 0; + virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y) { + NOT_IMPLEMENTED; } virtual void rng_uniform(const int N, const Dtype a, const Dtype b, - Dtype* r) = 0; + Dtype* r) { NOT_IMPLEMENTED; } virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, - Dtype* r) = 0; + Dtype* r) { NOT_IMPLEMENTED; } - virtual void rng_bernoulli(const int N, const Dtype p, int* r) = 0; + virtual void rng_bernoulli(const int N, const Dtype p, int* r) { + NOT_IMPLEMENTED; } - virtual void exp(const int N, const Dtype* a, Dtype* y) = 0; + virtual void exp(const int N, const Dtype* a, Dtype* y) { NOT_IMPLEMENTED; } - virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out) = 0; + virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out) { + NOT_IMPLEMENTED; } virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, - uint32_t* out) = 0; + uint32_t* out) { NOT_IMPLEMENTED; } // Returns the sum of the absolute values of the elements of vector x - virtual void asum(const int N, const Dtype* x, Dtype* y) = 0; + virtual void asum(const int N, const Dtype* x, Dtype* y) { NOT_IMPLEMENTED; } - virtual void sign(const int N, const Dtype* x, Dtype* y) = 0; + virtual void sign(const int N, const Dtype* x, Dtype* y) { NOT_IMPLEMENTED; } - virtual void sgnbit(const int N, const Dtype* x, Dtype* y) = 0; + virtual void sgnbit(const int N, const Dtype* x, Dtype* y) { + NOT_IMPLEMENTED; } - virtual void fabs(const int N, const Dtype* x, Dtype* y) = 0; + virtual void fabs(const int N, const Dtype* x, Dtype* y) { NOT_IMPLEMENTED; } virtual void scale(const int N, const Dtype alpha, const Dtype *x, - Dtype* y) = 0; + Dtype* y) { NOT_IMPLEMENTED; } virtual void im2col(const Dtype* data_im, const int channels, const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col) = 0; + const int stride, Dtype* data_col) { NOT_IMPLEMENTED; } virtual void col2im(const Dtype* data_col, const int channels, const int height, const int width, const int psize, const int pad, - const int stride, Dtype* data_im) = 0; + const int stride, Dtype* data_im) { NOT_IMPLEMENTED; } }; template @@ -247,7 +259,9 @@ class DeviceFactory { private: static Device* 
cpu_device_; static Device* gpu_device_; +#ifdef USE_OPENCL static Device* opencl_device_; +#endif }; } // namespace caffe From 821c74d7b0d8a9364e9c2d5bf3c4e3fa1837fdd2 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Wed, 18 Jun 2014 11:49:25 +0800 Subject: [PATCH 56/75] DeviceFactory opts out OpenCLDevice by default, users can opt in --- src/caffe/util/device.cpp | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/caffe/util/device.cpp b/src/caffe/util/device.cpp index 7c6a7e811ea..81c2fa83cdd 100644 --- a/src/caffe/util/device.cpp +++ b/src/caffe/util/device.cpp @@ -2,22 +2,26 @@ #include "caffe/common.hpp" #include "caffe/util/device.hpp" +#ifdef USE_OPENCL +#include "caffe/util/opencl_device.hpp" +#endif namespace caffe { template -Device* -DeviceFactory::GetDevice() { +Device* DeviceFactory::GetDevice() { switch (Caffe::mode()) { - case Caffe::CPU: - return cpu_device_; - case Caffe::GPU: - return gpu_device_; - case Caffe::OPENCL: - return opencl_device_; - default: - LOG(FATAL) << "Unknown caffe mode."; - return static_cast*>(NULL); + case Caffe::CPU: + return cpu_device_; + case Caffe::GPU: + return gpu_device_; +#ifdef USE_OPENCL + case Caffe::OPENCL: + return opencl_device_; +#endif + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast*>(NULL); } } @@ -27,6 +31,11 @@ Device* DeviceFactory::cpu_device_ = new CPUDevice(); template Device* DeviceFactory::gpu_device_ = new GPUDevice(); +#ifdef USE_OPENCL +template +Device* DeviceFactory::opencl_device_ = new OpenCLDevice(); +#endif + INSTANTIATE_CLASS(DeviceFactory); } // namespace caffe From dc7e05f2fb3b2071815647a1b460572966dac864 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Wed, 18 Jun 2014 11:52:06 +0800 Subject: [PATCH 57/75] Add variables in Makefile and config to build OpenCL related codes --- Makefile | 8 ++++++++ Makefile.config.example | 7 +++++++ 2 files changed, 15 insertions(+) diff --git a/Makefile b/Makefile index 77e2ff5edbe..41d594851f0 100644 --- a/Makefile +++ b/Makefile @@ -224,6 +224,14 @@ endif INCLUDE_DIRS += $(BLAS_INCLUDE) LIBRARY_DIRS += $(BLAS_LIB) +OPENCL ?= 0 +ifeq ($(OPENCL), 1) + INCLUDE_DIRS += $(OPENCL_INCLUDE_DIR) $(CLBLAS_INCLUDE_DIR) + LIBRARY_DIRS += $(OPENCL_LIB_DIR) $(CLBLAS_LIB_DIR) + LIBRARIES += $(OPENCL_LIBS) $(CLBLAS_LIBS) + COMMON_FLAGS += -DUSE_OPENCL +endif + # Complete build flags. COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS) diff --git a/Makefile.config.example b/Makefile.config.example index 73c3740b1c7..4bb904c54b4 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -46,6 +46,13 @@ PYTHON_INCLUDE := /usr/local/include/python2.7 \ PYTHON_LIB := /usr/local/lib # PYTHON_LIB := $(HOME)/anaconda/lib +OPENCL_INCLUDE_DIR := /opt/AMDAPP/include/ +OPENCL_LIB_DIR := /opt/AMDAPP/lib/x86_64/ +OPENCL_LIBS := OpenCL +CLBLAS_INCLUDE_DIR := /home/user/Codes/clBLAS/src/package/include +CLBLAS_LIB_DIR := /home/user/Codes/clBLAS/src/package/lib64 +CLBLAS_LIBS := clBLAS + # Whatever else you find you need goes here. 
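# (Editor's note, an illustrative usage sketch rather than part of the patch:
# the OpenCL/clBLAS paths above are example install locations and must be
# adjusted per system. With the OPENCL ?= 0 switch this patch adds to the
# Makefile, an OpenCL-aware build would be requested from the command line,
# for example
#   make OPENCL=1
# which appends these include/library paths and defines USE_OPENCL.)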
INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib From 20eaad88661df2c2433557b0f0ca0364512532ef Mon Sep 17 00:00:00 2001 From: Kai Li Date: Wed, 18 Jun 2014 11:54:37 +0800 Subject: [PATCH 58/75] Fix all the issues in OpenCLDevice that prevent successful building --- include/caffe/util/opencl_device.hpp | 122 ++++--------- src/caffe/util/opencl_device.cl | 86 +++++++++ src/caffe/util/opencl_device.cpp | 253 +++++++++++++++++++-------- 3 files changed, 297 insertions(+), 164 deletions(-) create mode 100644 src/caffe/util/opencl_device.cl diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp index fab89321722..73ee4a024be 100644 --- a/include/caffe/util/opencl_device.hpp +++ b/include/caffe/util/opencl_device.hpp @@ -8,6 +8,7 @@ #else #include #endif +#include "clBLAS.h" #include "glog/logging.h" @@ -24,42 +25,43 @@ namespace caffe { #define CLBLAS_CHECK(condition) \ do { \ - clblasStatus_t status = condition; \ + clblasStatus status = condition; \ CHECK_EQ(status, clblasSuccess) << " " \ << caffe::clblasGetErrorString(status); \ } while (0) #define CREATE_CL_MEM(A, M, K, FLAG) \ - int ld##A = (Trans##A == CblasNoTrans) ? K : M + cl_mem buf##A; \ do { \ - cl_int error; - cl_mem buf##A = clCreateBuffer( \ - Caffe::opencl_context(), CL_MEM_##FLAG, M * K * sizeof(*A), \ + cl_int error; \ + buf##A = clCreateBuffer( \ + OpenCLDevice::context(), CL_MEM_##FLAG, M * K * sizeof(*A), \ NULL, &error); \ CL_CHECK(error); \ } while(0) -#define RELEASE_CL_MEM(A) \ clReleaseMemObject(buf##A) +#define RELEASE_CL_MEM(A) clReleaseMemObject(buf##A) #define ENQUEUE_CL_BUFFER(FLAG, A, M, K) \ - CLBLAS_CHECK(clEnqueue##FLAG##Buffer( - Caffe::opencl_queue(), bufA, CL_TRUE, 0, M * K * sizeof(*A), + CL_CHECK(clEnqueue##FLAG##Buffer( \ + OpenCLDevice::queue(), bufA, CL_TRUE, 0, M * K * sizeof(*A), \ A, 0, NULL, NULL)); #define PRE_CLBLAS_CALL \ cl_uint num_command_queues = 1; \ cl_uint num_events_in_wait_list = 0; \ cl_event *event_wait_list = NULL; \ - cl_event events = NULL + cl_event events = NULL; \ + cl_command_queue queue = OpenCLDevice::queue(); #define ARRAY(A) buf##A, 0, ld##A #define CLBALS_TRAILING_ARGS \ - num_command_queues, Caffe::opencl_queue(), num_events_in_wait_list, \ + num_command_queues, &queue, num_events_in_wait_list, \ event_wait_list, &events const char* clGetErrorString(cl_int error); -const char* clblasGetErrorString(clblasStatus_t status); +const char* clblasGetErrorString(clblasStatus status); inline clblasTranspose to_clblasTranspose(const CBLAS_TRANSPOSE trans) { switch (trans) { @@ -80,72 +82,6 @@ inline clblasTranspose to_clblasTranspose(const CBLAS_TRANSPOSE trans) { i < (n); \ i += get_global_size(0)) -#define DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(name, operation) \ -template \ -__kernel void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ - OPENCL_KERNEL_LOOP(index, n) { \ - operation; \ - } \ -} \ -template <> \ -void caffe_opencl_##name(const int n, const float* x, float* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - n, x, y); \ -} \ -template <> \ -void caffe_opencl_##name(const int n, const double* x, double* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - n, x, y); \ -} \ -template \ -void OpenCLDevice::name(const int N, const Dtype* x, Dtype* y) { \ - caffe_opencl_##name(N, x, y); \ -} \ -template \ -void OpenCLDevice::name(const int N, const float* x, float* y); \ -template \ -void OpenCLDevice::name(const 
int N, const double* x, double* y); - - -#define DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(name, operation) \ -template \ -__kernel void name##_kernel(__global const int n, __global const Dtype* a, \ - __global const Dtype* b, __global Dtype* y) { \ - OPENCL_KERNEL_LOOP(i, n) { \ - operation; \ - } \ -} \ -template <> \ -void caffe_opencl_##name( \ - __global const int N, __global const float* a, \ - __global const float* b, __global float* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - N, a, b, y); \ -} \ -template <> \ -void caffe_opencl_##name( \ - __global const int N, __global const double* a, \ - __global const double* b, __global double* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - N, a, b, y); \ -} \ -template \ -void OpenCLDevice::name(const int N, const Dtype* a, const Dtype* b, \ - Dtype* y) { \ - caffe_opencl_##name(N, x, y); \ -} \ -template \ -void OpenCLDevice::name(const int N, const float* a, const float* b, \ - float* y); \ -template \ -void OpenCLDevice::name(const int N, const double* a, \ - const double* b, double* y); - - template void caffe_opencl_sqr(const int n, const Dtype* x, Dtype* y); @@ -207,15 +143,15 @@ class OpenCLDevice : public Device { virtual void scal(const int N, const Dtype alpha, Dtype *X); - virtual void sqr(const int N, const Dtype* a, Dtype* y); - - virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y); - - virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); - - virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); - - virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y); +// virtual void sqr(const int N, const Dtype* a, Dtype* y); +// +// virtual void add(const int N, const Dtype* a, const Dtype* b, Dtype* y); +// +// virtual void sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); +// +// virtual void mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); +// +// virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y); virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y); @@ -226,7 +162,7 @@ class OpenCLDevice : public Device { virtual void rng_bernoulli(const int N, const Dtype p, int* r); - virtual void exp(const int N, const Dtype* a, Dtype* y); +// virtual void exp(const int N, const Dtype* a, Dtype* y); virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); @@ -236,11 +172,11 @@ class OpenCLDevice : public Device { // Returns the sum of the absolute values of the elements of vector x virtual void asum(const int N, const Dtype* x, Dtype* y); - virtual void sign(const int N, const Dtype* x, Dtype* y); +// virtual void sign(const int N, const Dtype* x, Dtype* y); - virtual void sgnbit(const int N, const Dtype* x, Dtype* y); +// virtual void sgnbit(const int N, const Dtype* x, Dtype* y); - virtual void fabs(const int N, const Dtype* x, Dtype* y); +// virtual void fabs(const int N, const Dtype* x, Dtype* y); virtual void scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y); @@ -252,9 +188,13 @@ class OpenCLDevice : public Device { const int height, const int width, const int psize, const int pad, const int stride, Dtype* data_im); - inline static cl_command_queue queue(); + static cl_context context(); + static cl_command_queue queue(); private: + static cl_device_id cl_device_id_; + static cl_context cl_context_; static cl_command_queue cl_command_queue_; + static bool cl_context_created_; static bool 
cl_command_queue_created_; }; diff --git a/src/caffe/util/opencl_device.cl b/src/caffe/util/opencl_device.cl new file mode 100644 index 00000000000..77e4a5e8778 --- /dev/null +++ b/src/caffe/util/opencl_device.cl @@ -0,0 +1,86 @@ +// Copyright 2014 BVLC and contributors. + +#include "caffe/common.hpp" +#include "caffe/util/opencl_device.hpp" + +namespace caffe { + + +#define DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(name, operation) \ +template \ +__kernel void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ + OPENCL_KERNEL_LOOP(index, n) { \ + operation; \ + } \ +} \ +template <> \ +void caffe_opencl_##name(const int n, const float* x, float* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + n, x, y); \ +} \ +template <> \ +void caffe_opencl_##name(const int n, const double* x, double* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + n, x, y); \ +} \ +template \ +void OpenCLDevice::name(const int N, const Dtype* x, Dtype* y) { \ + caffe_opencl_##name(N, x, y); \ +} \ +template \ +void OpenCLDevice::name(const int N, const float* x, float* y); \ +template \ +void OpenCLDevice::name(const int N, const double* x, double* y); + + +#define DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(name, operation) \ +template \ +__kernel void name##_kernel(__global const int n, __global const Dtype* a, \ + __global const Dtype* b, __global Dtype* y) { \ + OPENCL_KERNEL_LOOP(i, n) { \ + operation; \ + } \ +} \ +template <> \ +void caffe_opencl_##name( \ + __global const int N, __global const float* a, \ + __global const float* b, __global float* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + N, a, b, y); \ +} \ +template <> \ +void caffe_opencl_##name( \ + __global const int N, __global const double* a, \ + __global const double* b, __global double* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + N, a, b, y); \ +} \ +template \ +void OpenCLDevice::name(const int N, const Dtype* a, const Dtype* b, \ + Dtype* y) { \ + caffe_opencl_##name(N, x, y); \ +} \ +template \ +void OpenCLDevice::name(const int N, const float* a, const float* b, \ + float* y); \ +template \ +void OpenCLDevice::name(const int N, const double* a, \ + const double* b, double* y); + + +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sqr, y[i] = x[i] * x[i]); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(exp, y[i] = exp(x[i])); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sign, y[i] = sign(x[i])); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sgnbit, y[i] = signbit(x[i])); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(fabs, y[i] = fabs(x[i])); + +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(add, y[i] = a[i] + b[i]); +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(sub, y[i] = a[i] - b[i]); +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(mul, y[i] = a[i] * b[i]); +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(div, y[i] = a[i] / b[i]); + +} // namespace caffe diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index b4299e01784..8e3153e0653 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -3,59 +3,67 @@ #include "caffe/common.hpp" #include "caffe/util/opencl_device.hpp" +#include + namespace caffe { template -OpenCLDevice::cl_command_queue_created_ = false; +cl_device_id OpenCLDevice::cl_device_id_ = NULL; + +template +cl_context OpenCLDevice::cl_context_ = NULL; + +template +cl_command_queue OpenCLDevice::cl_command_queue_ = NULL; + +template +bool 
OpenCLDevice::cl_command_queue_created_ = false; + +template +bool OpenCLDevice::cl_context_created_ = false; /** - * http://opencl.codeplex.com/wikipage?title=OpenCL%20Tutorials%20-%201 + * http://dhruba.name/2012/10/14/opencl-cookbook-how-to-leverage-multiple-devices-in-opencl/ */ template -OpenCLDevice::cl_command_queue queue() { - if (cl_command_queue_created_) { - return cl_command_queue_; - } else { +cl_context OpenCLDevice::context() { + if (!cl_context_created_) { cl_int error = 0; // Used to handle error codes cl_platform_id platform; - cl_context context; - cl_command_queue queue; - cl_device_id device; // Platform - CL_CHECK(oclGetPlatformID(&platform)); + CL_CHECK(clGetPlatformIDs(1, &platform, NULL)); // Device - CL_CHECK(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL)); + CL_CHECK(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &cl_device_id_, + NULL)); // Context - context = clCreateContext(0, 1, &device, NULL, NULL, &error); + cl_context_ = clCreateContext(0, 1, &cl_device_id_, NULL, NULL, &error); CL_CHECK(error); - // Command-queue - queue = clCreateCommandQueue(context, device, 0, &error); + cl_context_created_ = true; + } + return cl_context_; +} + +template +cl_command_queue OpenCLDevice::queue() { + if (!cl_command_queue_created_) { + cl_int error = 0; // Used to handle error codes + cl_command_queue_ = clCreateCommandQueue(context(), cl_device_id_, 0, + &error); CL_CHECK(error); cl_command_queue_created_ = true; } + return cl_command_queue_; } -DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sqr, y[i] = x[i] * x[i]); -DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(exp, y[i] = exp(x[i])); -DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sign, y[i] = sign(x[i])); -DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sgnbit, y[i] = signbit(x[i])); -DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(fabs, y[i] = fabs(x[i])); - -DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(add, y[i] = a[i] + b[i]); -DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(sub, y[i] = a[i] - b[i]); -DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(mul, y[i] = a[i] * b[i]); -DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(div, y[i] = a[i] / b[i]); - template <> void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C) { - // Note that cublas follows fortran order. - LEAD_DIM(A, M, K); - LEAD_DIM(B, K, N); - LEAD_DIM(C, M, N); + int ldA = (TransA == CblasNoTrans) ? K : M; + int ldB = (TransB == CblasNoTrans) ? N : K; + int ldC = N; clblasTranspose clTransA = to_clblasTranspose(TransA); clblasTranspose clTransB = to_clblasTranspose(TransB); CREATE_CL_MEM(A, M, K, READ_ONLY); @@ -67,7 +75,7 @@ void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, PRE_CLBLAS_CALL; // bufX is defined by the macro CREATE_CL_MEM(X, ...) CLBLAS_CHECK(clblasSgemm(clblasRowMajor, clTransA, clTransB, - M, N, K, &alpha, ARRAY(A), ARRAY(B), &beta, ARRAY(C), + M, N, K, alpha, ARRAY(A), ARRAY(B), beta, ARRAY(C), CLBALS_TRAILING_ARGS)); /* Release OpenCL memory objects. */ RELEASE_CL_MEM(C); @@ -81,10 +89,9 @@ void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C) { - // Note that cublas follows fortran order. - LEAD_DIM(A, M, K); - LEAD_DIM(B, K, N); - LEAD_DIM(C, M, N); + int ldA = (TransA == CblasNoTrans) ? K : M; + int ldB = (TransB == CblasNoTrans) ? 
N : K; + int ldC = N; clblasTranspose clTransA = to_clblasTranspose(TransA); clblasTranspose clTransB = to_clblasTranspose(TransB); CREATE_CL_MEM(A, M, K, READ_ONLY); @@ -96,7 +103,7 @@ void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, PRE_CLBLAS_CALL; // bufX is defined by the macro CREATE_CL_MEM(X, ...) CLBLAS_CHECK(clblasDgemm(clblasRowMajor, clTransA, clTransB, - M, N, K, &alpha, ARRAY(A), ARRAY(B), &beta, ARRAY(C), + M, N, K, alpha, ARRAY(A), ARRAY(B), beta, ARRAY(C), CLBALS_TRAILING_ARGS)); /* Release OpenCL memory objects. */ RELEASE_CL_MEM(C); @@ -109,11 +116,15 @@ void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, const float beta, float* y) { clblasTranspose clTransA = to_clblasTranspose(TransA); + int ldA = (TransA == CblasNoTrans) ? N : M; + int ldx = N; + int ldy = N; CREATE_CL_MEM(A, M, N, READ_ONLY); CREATE_CL_MEM(x, N, 1, READ_ONLY); CREATE_CL_MEM(y, M, 1, READ_WRITE); - CLBLAS_CHECK(clblasSgemv(clblasRowMajor, clTransA, M, N, &alpha, - ARRAY(A), ARRAY(x), &beta, ARRAY(y), + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasSgemv(clblasRowMajor, clTransA, M, N, alpha, + ARRAY(A), ARRAY(x), beta, ARRAY(y), CLBALS_TRAILING_ARGS)); } @@ -123,33 +134,61 @@ void OpenCLDevice::gemv( const int N, const double alpha, const double* A, const double* x, const double beta, double* y) { clblasTranspose clTransA = to_clblasTranspose(TransA); + int ldA = (TransA == CblasNoTrans) ? N : M; + int ldx = N; + int ldy = N; CREATE_CL_MEM(A, M, N, READ_ONLY); CREATE_CL_MEM(x, N, 1, READ_ONLY); CREATE_CL_MEM(y, M, 1, READ_WRITE); - CLBLAS_CHECK(clblasDgemv(clblasRowMajor, clTransA, M, N, &alpha, - ARRAY(A), ARRAY(x), &beta, ARRAY(y), + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasDgemv(clblasRowMajor, clTransA, M, N, alpha, + ARRAY(A), ARRAY(x), beta, ARRAY(y), CLBALS_TRAILING_ARGS)); } template <> void OpenCLDevice::axpy(const int N, const float alpha, const float* X, float* Y) { + int ldX = N; + int ldY = N; CREATE_CL_MEM(X, N, 1, READ_ONLY); CREATE_CL_MEM(Y, N, 1, READ_WRITE); PRE_CLBLAS_CALL; - CUBLAS_CHECK(clblasSaxpy( - N, &alpha, ARRAY(X), ARRAY(Y), + CLBLAS_CHECK(clblasSaxpy( + N, alpha, ARRAY(X), ARRAY(Y), CLBALS_TRAILING_ARGS)); } template <> void OpenCLDevice::axpy(const int N, const double alpha, const double* X, double* Y) { + int ldX = N; + int ldY = N; CREATE_CL_MEM(X, N, 1, READ_ONLY); CREATE_CL_MEM(Y, N, 1, READ_WRITE); PRE_CLBLAS_CALL; - CUBLAS_CHECK(clblasDaxpy( - N, &alpha, ARRAY(X), ARRAY(Y), + CLBLAS_CHECK(clblasDaxpy( + N, alpha, ARRAY(X), ARRAY(Y), + CLBALS_TRAILING_ARGS)); +} + +template <> +void OpenCLDevice::scal(const int N, const float alpha, float *X) { + int ldX = N; + CREATE_CL_MEM(X, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasSscal( + N, alpha, ARRAY(X), + CLBALS_TRAILING_ARGS)); +} + +template <> +void OpenCLDevice::scal(const int N, const double alpha, double *X) { + int ldX = N; + CREATE_CL_MEM(X, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasDscal( + N, alpha, ARRAY(X), CLBALS_TRAILING_ARGS)); } @@ -157,20 +196,22 @@ template <> void OpenCLDevice::axpby( const int N, const float alpha, const float* X, const float beta, float* Y) { - this->scal(N, beta, Y); - this->axpy(N, alpha, X, Y); + this->scal(N, beta, Y); + this->axpy(N, alpha, X, Y); } template <> void OpenCLDevice::axpby( const int N, const double alpha, const double* X, const double beta, double* Y) { - this->scal(N, beta, Y); - this->axpy(N, alpha, X, Y); + this->scal(N, beta, Y); + this->axpy(N, alpha, X, 
Y); } template <> void OpenCLDevice::copy(const int N, const float *X, float *Y) { + int ldX = N; + int ldY = N; CREATE_CL_MEM(X, N, 1, READ_ONLY); CREATE_CL_MEM(Y, N, 1, READ_WRITE); PRE_CLBLAS_CALL; @@ -181,6 +222,8 @@ void OpenCLDevice::copy(const int N, const float *X, float *Y) { template <> void OpenCLDevice::copy(const int N, const double *X, double *Y) { + int ldX = N; + int ldY = N; CREATE_CL_MEM(X, N, 1, READ_ONLY); CREATE_CL_MEM(Y, N, 1, READ_WRITE); PRE_CLBLAS_CALL; @@ -217,13 +260,18 @@ void OpenCLDevice::copy_from_cpu(const int N, const double *X, */ template void OpenCLDevice::set(const int N, const Dtype alpha, Dtype *X) { +#ifdef CL_VERSION_1_2 CREATE_CL_MEM(X, N, 1, READ_WRITE); cl_uint num_events_in_wait_list = 0; cl_event *event_wait_list = NULL; - cl_event events = NULL; + cl_event event = NULL; CL_CHECK(clEnqueueFillBuffer( - OpenCLDevice::queue(), bufA, &alpha, sizeof(Dtype), 0, - sizeof(Dtype) * N, num_events_in_wait_list, event_wait_list, &event)); + OpenCLDevice::queue(), bufX, static_cast(&alpha), sizeof(Dtype), + 0, sizeof(Dtype) * N, num_events_in_wait_list, event_wait_list, &event)); +#else + std::vector tmp(N, alpha); + copy_from_cpu(N, &tmp[0], X); +#endif } template @@ -237,23 +285,12 @@ void OpenCLDevice::add_scalar(const int N, const Dtype alpha, NOT_IMPLEMENTED; } -template <> -void OpenCLDevice::scal(const int N, const float alpha, float *X) { - CREATE_CL_MEM(X, N, 1, READ_WRITE); - PRE_CLBLAS_CALL; - CLBLAS_CHECK(clblasSscal( - N, alpha, ARRAY(X), - CLBALS_TRAILING_ARGS)); -} +template +void OpenCLDevice::add_scalar(const int N, const float alpha, float *X); +template +void OpenCLDevice::add_scalar(const int N, const double alpha, + double *X); -template <> -void OpenCLDevice::scal(const int N, const double alpha, double *X) { - CREATE_CL_MEM(X, N, 1, READ_WRITE); - PRE_CLBLAS_CALL; - CLBLAS_CHECK(clblasDscal( - N, alpha, ARRAY(X), - CLBALS_TRAILING_ARGS)); -} template void OpenCLDevice::powx(const int N, const Dtype* a, const Dtype b, @@ -262,6 +299,14 @@ void OpenCLDevice::powx(const int N, const Dtype* a, const Dtype b, // caffe_gpu_powx(N, a, b, y); } +template +void OpenCLDevice::powx(const int N, const float* a, const float b, + float *y); +template +void OpenCLDevice::powx(const int N, const double* a, + const double b, double *y); + + template void OpenCLDevice::rng_uniform(const int N, const Dtype a, const Dtype b, Dtype* r) { @@ -269,6 +314,13 @@ void OpenCLDevice::rng_uniform(const int N, const Dtype a, // caffe_gpu_rng_uniform(N, a, b, r); } +template +void OpenCLDevice::rng_uniform( + const int N, const float a, const float b, float* r); +template +void OpenCLDevice::rng_uniform( + const int N, const double a, const double b, double* r); + template void OpenCLDevice::rng_gaussian(const int N, const Dtype mu, const Dtype sigma, Dtype* r) { @@ -276,12 +328,24 @@ void OpenCLDevice::rng_gaussian(const int N, const Dtype mu, // caffe_gpu_rng_gaussian(N, mu, sigma, r); } +template +void OpenCLDevice::rng_gaussian( + const int N, const float mu, const float sigma, float* r); +template +void OpenCLDevice::rng_gaussian( + const int N, const double mu, const double sigma, double* r); + template void OpenCLDevice::rng_bernoulli(const int N, const Dtype p, int* r) { NOT_IMPLEMENTED; // caffe_gpu_rng_bernoulli(N, p, r); } +template +void OpenCLDevice::rng_bernoulli(const int N, const float p, int* r); +template +void OpenCLDevice::rng_bernoulli(const int N, const double p, int* r); + template void OpenCLDevice::dot(const int N, const Dtype* x, const 
Dtype* y, Dtype* out) { @@ -289,6 +353,13 @@ void OpenCLDevice::dot(const int N, const Dtype* x, const Dtype* y, // caffe_gpu_dot(N, x, y, out); } +template +void OpenCLDevice::dot(const int N, const float* x, const float* y, + float* out); +template +void OpenCLDevice::dot(const int N, const double* x, const double* y, + double* out); + template void OpenCLDevice::hamming_distance(const int N, const Dtype* x, const Dtype* y, uint32_t* out) { @@ -296,6 +367,13 @@ void OpenCLDevice::hamming_distance(const int N, const Dtype* x, // *out = caffe_gpu_hamming_distance(N, x, y); } +template +void OpenCLDevice::hamming_distance(const int N, const float* x, + const float* y, uint32_t* out); +template +void OpenCLDevice::hamming_distance(const int N, const double* x, + const double* y, uint32_t* out); + /** * clblasSasum( @@ -323,11 +401,16 @@ void OpenCLDevice::asum(const int N, const Dtype* x, Dtype* y) { // CLBALS_TRAILING_ARGS)); } +template +void OpenCLDevice::asum(const int N, const float* x, float* y); +template +void OpenCLDevice::asum(const int N, const double* x, double* y); + template void OpenCLDevice::scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y) { - this->copy(N, x, y); - this->scal(N, alpha, y); + this->copy(N, x, y); + this->scal(N, alpha, y); } template @@ -338,23 +421,47 @@ void OpenCLDevice::scale(const int N, const double alpha, const double *x, double* y); template -void OpenCLDevice::im2col(const Dtype* data_im, const int channels, +void OpenCLDevice::im2col( + const Dtype* data_im, const int channels, const int height, const int width, const int ksize, const int pad, const int stride, Dtype* data_col) { - NOT_IMPLEMENTED; +// NOT_IMPLEMENTED; // im2col_gpu(data_im, channels, height, width, ksize, pad, stride, // data_col); } +template +void OpenCLDevice::im2col( + const float* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, float* data_col); +template +void OpenCLDevice::im2col( + const double* data_im, const int channels, + const int height, const int width, const int ksize, const int pad, + const int stride, double* data_col); + template -void OpenCLDevice::col2im(const Dtype* data_col, const int channels, +void OpenCLDevice::col2im( + const Dtype* data_col, const int channels, const int height, const int width, const int psize, const int pad, const int stride, Dtype* data_im) { - NOT_IMPLEMENTED; +// NOT_IMPLEMENTED; // col2im_gpu(data_col, channels, height, width, psize, pad, stride, // data_im); } +template +void OpenCLDevice::col2im( + const float* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, float* data_im); +template +void OpenCLDevice::col2im( + const double* data_col, const int channels, + const int height, const int width, const int psize, const int pad, + const int stride, double* data_im); + const char* clGetErrorString(cl_int error) { switch (error) { case CL_SUCCESS: @@ -385,7 +492,7 @@ const char* clGetErrorString(cl_int error) { return "Unknown OpenCL error"; } -const char* clblasGetErrorString(clblasStatus_t status) { +const char* clblasGetErrorString(clblasStatus status) { switch (status) { case clblasSuccess: return "clblasSuccess"; From 18c136e4052905540698a42168e34be9460f2408 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Wed, 18 Jun 2014 13:12:12 +0800 Subject: [PATCH 59/75] Fix the rebase errors introduced when merge conflicts are resolved --- src/caffe/layers/hdf5_output_layer.cpp | 6 ------ 1 
file changed, 6 deletions(-) diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index 2093b42d760..06b566ffe7d 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -73,12 +73,6 @@ Dtype HDF5OutputLayer::Forward(const vector*>& bottom, return Dtype(0.); } -template -void HDF5OutputLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - return; -} - INSTANTIATE_CLASS(HDF5OutputLayer); } // namespace caffe From 7b619ce626520f88a420f7f9b2244e829da125af Mon Sep 17 00:00:00 2001 From: Kai Li Date: Wed, 18 Jun 2014 16:08:38 +0800 Subject: [PATCH 60/75] OpenCLDevice supports multiple platforms and devices --- include/caffe/common.hpp | 2 +- include/caffe/util/opencl_device.hpp | 32 +++++-- src/caffe/util/device.cpp | 3 +- src/caffe/util/opencl_device.cpp | 119 ++++++++++++++++++++------- 4 files changed, 115 insertions(+), 41 deletions(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 722a7198bc6..52043838bf0 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -74,7 +74,7 @@ class Caffe { } return *singleton_; } - enum Brew { CPU, GPU, OPENCL }; + enum Brew { CPU, GPU, OPENCL_CPU, OPENCL_GPU }; enum Phase { TRAIN, TEST }; diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp index 73ee4a024be..0910169e9f3 100644 --- a/include/caffe/util/opencl_device.hpp +++ b/include/caffe/util/opencl_device.hpp @@ -14,6 +14,8 @@ #include "caffe/util/device.hpp" +#include + namespace caffe { #define CL_CHECK(condition) \ @@ -116,8 +118,12 @@ void caffe_opencl_div(const int N, const Dtype* a, template class OpenCLDevice : public Device { public: - OpenCLDevice() { + OpenCLDevice() : + current_device_id_(0), current_cl_platform_id_(NULL), + current_platform_device_count_(0), current_platform_device_id_(0), + cl_context_(NULL), cl_command_queue_(NULL) { } + virtual ~OpenCLDevice() { } @@ -188,14 +194,22 @@ class OpenCLDevice : public Device { const int height, const int width, const int psize, const int pad, const int stride, Dtype* data_im); - static cl_context context(); - static cl_command_queue queue(); - private: - static cl_device_id cl_device_id_; - static cl_context cl_context_; - static cl_command_queue cl_command_queue_; - static bool cl_context_created_; - static bool cl_command_queue_created_; + void SetDevice(const int device_id); + inline cl_context context(); + inline cl_command_queue queue(); + protected: + cl_device_type get_device_type(); + cl_device_id current_cl_device_id(); + void release_context(); + void release_queue(); + protected: + int current_device_id_; + cl_platform_id current_cl_platform_id_; + cl_int current_platform_device_count_; + std::vector current_platform_device_ids_; + int current_platform_device_id_; + cl_context cl_context_; + cl_command_queue cl_command_queue_; }; diff --git a/src/caffe/util/device.cpp b/src/caffe/util/device.cpp index 81c2fa83cdd..bb25372122f 100644 --- a/src/caffe/util/device.cpp +++ b/src/caffe/util/device.cpp @@ -16,7 +16,8 @@ Device* DeviceFactory::GetDevice() { case Caffe::GPU: return gpu_device_; #ifdef USE_OPENCL - case Caffe::OPENCL: + case Caffe::OPENCL_CPU: + case Caffe::OPENCL_GPU: return opencl_device_; #endif default: diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index 8e3153e0653..e5774754d81 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -8,53 +8,112 @@ namespace 
caffe { template -cl_device_id OpenCLDevice::cl_device_id_ = NULL; - -template -cl_context OpenCLDevice::cl_context_ = NULL; - -template -cl_command_queue OpenCLDevice::cl_command_queue_ = NULL; - -template -bool OpenCLDevice::cl_command_queue_created_ = false; - -template -bool OpenCLDevice::cl_context_created_ = false; +cl_device_type OpenCLDevice::get_device_type() { + switch (Caffe::mode()) { + case Caffe::OPENCL_CPU: + return CL_DEVICE_TYPE_CPU; + case Caffe::OPENCL_GPU: + return CL_DEVICE_TYPE_GPU; + default: + LOG(FATAL) << "Unknown Caffe OpenCL mode."; + return CL_DEVICE_TYPE_DEFAULT; + } +} /** - * http://dhruba.name/2012/10/14/opencl-cookbook-how-to-leverage-multiple-devices-in-opencl/ + * http://dhruba.name/2012/08/14/opencl-cookbook-listing-all-devices-and-their-critical-attributes/ */ template cl_context OpenCLDevice::context() { - if (!cl_context_created_) { - cl_int error = 0; // Used to handle error codes - cl_platform_id platform; - // Platform - CL_CHECK(clGetPlatformIDs(1, &platform, NULL)); - // Device - CL_CHECK(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &cl_device_id_, - NULL)); - // Context - cl_context_ = clCreateContext(0, 1, &cl_device_id_, NULL, NULL, &error); - CL_CHECK(error); - cl_context_created_ = true; + if (cl_context_ == NULL) { + cl_uint platformCount; + CL_CHECK(clGetPlatformIDs(0, NULL, &platformCount)); + + cl_platform_id* platforms = (cl_platform_id*) + malloc(sizeof(cl_platform_id) * platformCount); + CL_CHECK(clGetPlatformIDs(1, platforms, NULL)); + + cl_uint deviceCount; + cl_device_type device_type = get_device_type(); + int num_devices_to_skip = current_device_id_; + while (num_devices_to_skip >= 0) { + for (int i = 0; i < platformCount; i++) { + cl_context_properties properties[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)( + platforms[i]), 0}; + // get all devices + clGetDeviceIDs(platforms[i], device_type, 0, NULL, &deviceCount); + if (num_devices_to_skip <= deviceCount) { + current_cl_platform_id_ = platforms[i]; + current_platform_device_count_ = deviceCount; + current_platform_device_id_ = num_devices_to_skip; + current_platform_device_ids_.resize(deviceCount); + CL_CHECK(clGetDeviceIDs(current_cl_platform_id_, device_type, + current_platform_device_count_, + &(current_platform_device_ids_[0]), NULL)); + cl_int error = CL_SUCCESS; // Used to handle error codes + // TODO: clCreateContext or clCreateContextFromType? 
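          // (Editor's note, not part of the patch: clCreateContext takes an
          // explicit list of cl_device_id values and the resulting context
          // contains only those devices, while clCreateContextFromType pulls
          // in every device of the requested type on the chosen platform.
          // The code below opts for clCreateContextFromType, so the
          // clCreateCommandQueue call in queue() still has to name one
          // concrete device, which is what current_cl_device_id() provides.)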
+ /* + * http://dhruba.name/2012/10/14/opencl-cookbook-how-to-leverage-multiple-devices-in-opencl/ + */ + // cl_context_ = clCreateContext(properties, deviceCount, devices, NULL, + // NULL, &error); + cl_context_ = clCreateContextFromType(properties, device_type, NULL, + NULL, &error); + CL_CHECK(error); + } + num_devices_to_skip -= deviceCount; + if (num_devices_to_skip < 0) { + break; + } + } + } } return cl_context_; } +template +cl_device_id OpenCLDevice::current_cl_device_id() { + // To initialize current platform info + context(); + return current_platform_device_ids_[current_platform_device_id_]; +} + template cl_command_queue OpenCLDevice::queue() { - if (!cl_command_queue_created_) { + if (cl_command_queue_ == NULL) { cl_int error = 0; // Used to handle error codes - cl_command_queue_ = clCreateCommandQueue(context(), cl_device_id_, 0, - &error); + cl_command_queue_properties properties = 0; + cl_command_queue_ = clCreateCommandQueue( + context(), current_cl_device_id(), properties, &error); CL_CHECK(error); - cl_command_queue_created_ = true; } return cl_command_queue_; } +template +void OpenCLDevice::release_context() { + CL_CHECK(clReleaseContext(cl_context_)); + cl_context_ = NULL; +} + +template +void OpenCLDevice::release_queue() { + CL_CHECK(clReleaseCommandQueue(cl_command_queue_)); + cl_command_queue_ = NULL; +} + +template +void OpenCLDevice::SetDevice(const int device_id) { + if (current_device_id_ != device_id) { + current_device_id_ = device_id; + release_queue(); + // TODO: reuse context for the devices of the same platform + release_context(); + context(); + } +} + template <> void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, From 22acd5dcbeb2f202710325163a0edb4fc51eb312 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Wed, 18 Jun 2014 16:44:14 +0800 Subject: [PATCH 61/75] Initialize and finalize clBLAS in the OpenCLDevice --- include/caffe/util/opencl_device.hpp | 6 +++++- src/caffe/util/opencl_device.cpp | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp index 0910169e9f3..f88ea305f96 100644 --- a/include/caffe/util/opencl_device.hpp +++ b/include/caffe/util/opencl_device.hpp @@ -121,7 +121,8 @@ class OpenCLDevice : public Device { OpenCLDevice() : current_device_id_(0), current_cl_platform_id_(NULL), current_platform_device_count_(0), current_platform_device_id_(0), - cl_context_(NULL), cl_command_queue_(NULL) { + cl_context_(NULL), cl_command_queue_(NULL), clblas_initialized_(false) { + initialize_clblas(); } virtual ~OpenCLDevice() { @@ -202,6 +203,8 @@ class OpenCLDevice : public Device { cl_device_id current_cl_device_id(); void release_context(); void release_queue(); + void initialize_clblas(); + void finalize_clblas(); protected: int current_device_id_; cl_platform_id current_cl_platform_id_; @@ -210,6 +213,7 @@ class OpenCLDevice : public Device { int current_platform_device_id_; cl_context cl_context_; cl_command_queue cl_command_queue_; + bool clblas_initialized_; }; diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index e5774754d81..7c5baae21c4 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -111,6 +111,24 @@ void OpenCLDevice::SetDevice(const int device_id) { // TODO: reuse context for the devices of the same platform release_context(); context(); + finalize_clblas(); + initialize_clblas(); + } +} + +template +void 
OpenCLDevice::initialize_clblas() { + if (!clblas_initialized_) { + CLBLAS_CHECK(clblasSetup()); + clblas_initialized_ = true; + } +} + +template +void OpenCLDevice::finalize_clblas() { + if (clblas_initialized_) { + clblasTeardown(); + clblas_initialized_ = false; } } From ba4cf00d6de07be60396d88616e9dd89109f978d Mon Sep 17 00:00:00 2001 From: Kai Li Date: Wed, 18 Jun 2014 18:07:20 +0800 Subject: [PATCH 62/75] Split math functions and global device statuses out of OpenCLDevice --- include/caffe/util/opencl_device.hpp | 187 +++--- include/caffe/util/opencl_math_functions.hpp | 134 +++++ src/caffe/common.cpp | 1 - src/caffe/util/opencl_device.cpp | 585 ++++++------------- src/caffe/util/opencl_math_functions.cpp | 220 +++++++ 5 files changed, 591 insertions(+), 536 deletions(-) create mode 100644 include/caffe/util/opencl_math_functions.hpp create mode 100644 src/caffe/util/opencl_math_functions.cpp diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp index f88ea305f96..23b41a158d0 100644 --- a/include/caffe/util/opencl_device.hpp +++ b/include/caffe/util/opencl_device.hpp @@ -32,97 +32,76 @@ namespace caffe { << caffe::clblasGetErrorString(status); \ } while (0) -#define CREATE_CL_MEM(A, M, K, FLAG) \ - cl_mem buf##A; \ - do { \ - cl_int error; \ - buf##A = clCreateBuffer( \ - OpenCLDevice::context(), CL_MEM_##FLAG, M * K * sizeof(*A), \ - NULL, &error); \ - CL_CHECK(error); \ - } while(0) - -#define RELEASE_CL_MEM(A) clReleaseMemObject(buf##A) - -#define ENQUEUE_CL_BUFFER(FLAG, A, M, K) \ - CL_CHECK(clEnqueue##FLAG##Buffer( \ - OpenCLDevice::queue(), bufA, CL_TRUE, 0, M * K * sizeof(*A), \ - A, 0, NULL, NULL)); - -#define PRE_CLBLAS_CALL \ - cl_uint num_command_queues = 1; \ - cl_uint num_events_in_wait_list = 0; \ - cl_event *event_wait_list = NULL; \ - cl_event events = NULL; \ - cl_command_queue queue = OpenCLDevice::queue(); - -#define ARRAY(A) buf##A, 0, ld##A - -#define CLBALS_TRAILING_ARGS \ - num_command_queues, &queue, num_events_in_wait_list, \ - event_wait_list, &events - const char* clGetErrorString(cl_int error); const char* clblasGetErrorString(clblasStatus status); -inline clblasTranspose to_clblasTranspose(const CBLAS_TRANSPOSE trans) { - switch (trans) { - case CblasNoTrans: - return clblasNoTrans; - case CblasTrans: - return clblasTrans; - case CblasConjTrans: - return clblasConjTrans; - default: - LOG(FATAL) << "Unknown CBLAS_TRANSPOSE " << trans; - } -} - // OpenCL: grid stride looping #define OPENCL_KERNEL_LOOP(i, n) \ for (int i = get_global_id(0); \ i < (n); \ i += get_global_size(0)) -template -void caffe_opencl_sqr(const int n, const Dtype* x, Dtype* y); - -template -void caffe_opencl_exp(const int n, const Dtype* x, Dtype* y); - -template -void caffe_opencl_sign(const int n, const Dtype* x, Dtype* y); - -template -void caffe_opencl_sgnbit(const int n, const Dtype* x, Dtype* y); +class CaffeOpenCL { + public: + inline static CaffeOpenCL& Get() { + if (!singleton_.get()) { + singleton_.reset(new CaffeOpenCL()); + } + return *singleton_; + } -template -void caffe_opencl_fabs(const int n, const Dtype* x, Dtype* y); + virtual ~CaffeOpenCL() { + } -template -void caffe_opencl_add(const int N, const Dtype* a, - const Dtype* b, Dtype* y); + void SetDevice(const int device_id); + inline static cl_context context() { + if (Get().cl_context_ == NULL) { + Get().create_context(); + } + return Get().cl_context_; + } + inline static cl_command_queue queue() { + if (Get().cl_command_queue_ == NULL) { + Get().create_queue(); + } + return 
Get().cl_command_queue_; + } + protected: + cl_device_type get_device_type(); + cl_device_id current_cl_device_id(); + void create_context(); + void release_context(); + void create_queue(); + void release_queue(); + void initialize_clblas(); + void finalize_clblas(); + protected: + static shared_ptr singleton_; -template -void caffe_opencl_sub(const int N, const Dtype* a, - const Dtype* b, Dtype* y); + int current_device_id_; + cl_platform_id current_cl_platform_id_; + cl_int current_platform_device_count_; + std::vector current_platform_device_ids_; + int current_platform_device_id_; + cl_context cl_context_; + cl_command_queue cl_command_queue_; + bool clblas_initialized_; + private: + CaffeOpenCL() : + current_device_id_(0), current_cl_platform_id_(NULL), + current_platform_device_count_(0), current_platform_device_id_(0), + cl_context_(NULL), cl_command_queue_(NULL), clblas_initialized_(false) { + initialize_clblas(); + } -template -void caffe_opencl_mul(const int N, const Dtype* a, - const Dtype* b, Dtype* y); + DISABLE_COPY_AND_ASSIGN(CaffeOpenCL); +}; -template -void caffe_opencl_div(const int N, const Dtype* a, - const Dtype* b, Dtype* y); template class OpenCLDevice : public Device { public: - OpenCLDevice() : - current_device_id_(0), current_cl_platform_id_(NULL), - current_platform_device_count_(0), current_platform_device_id_(0), - cl_context_(NULL), cl_command_queue_(NULL), clblas_initialized_(false) { - initialize_clblas(); + OpenCLDevice() : Device() { } virtual ~OpenCLDevice() { @@ -146,7 +125,7 @@ class OpenCLDevice : public Device { virtual void set(const int N, const Dtype alpha, Dtype *X); - virtual void add_scalar(const int N, const Dtype alpha, Dtype *X); +// virtual void add_scalar(const int N, const Dtype alpha, Dtype *X); virtual void scal(const int N, const Dtype alpha, Dtype *X); @@ -160,24 +139,24 @@ class OpenCLDevice : public Device { // // virtual void div(const int N, const Dtype* a, const Dtype* b, Dtype* y); - virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y); - - virtual void rng_uniform(const int N, const Dtype a, const Dtype b, Dtype* r); - - virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, - Dtype* r); +// virtual void powx(const int N, const Dtype* a, const Dtype b, Dtype* y); - virtual void rng_bernoulli(const int N, const Dtype p, int* r); +// virtual void rng_uniform(const int N, const Dtype a, const Dtype b, Dtype* r); +// +// virtual void rng_gaussian(const int N, const Dtype mu, const Dtype sigma, +// Dtype* r); +// +// virtual void rng_bernoulli(const int N, const Dtype p, int* r); // virtual void exp(const int N, const Dtype* a, Dtype* y); - virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); - - virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, - uint32_t* out); +// virtual void dot(const int N, const Dtype* x, const Dtype* y, Dtype* out); +// +// virtual void hamming_distance(const int N, const Dtype* x, const Dtype* y, +// uint32_t* out); // Returns the sum of the absolute values of the elements of vector x - virtual void asum(const int N, const Dtype* x, Dtype* y); +// virtual void asum(const int N, const Dtype* x, Dtype* y); // virtual void sign(const int N, const Dtype* x, Dtype* y); @@ -185,35 +164,15 @@ class OpenCLDevice : public Device { // virtual void fabs(const int N, const Dtype* x, Dtype* y); - virtual void scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y); - - virtual void im2col(const Dtype* data_im, const int 
channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col); +// virtual void scale(const int N, const Dtype alpha, const Dtype *x, Dtype* y); - virtual void col2im(const Dtype* data_col, const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, Dtype* data_im); - - void SetDevice(const int device_id); - inline cl_context context(); - inline cl_command_queue queue(); - protected: - cl_device_type get_device_type(); - cl_device_id current_cl_device_id(); - void release_context(); - void release_queue(); - void initialize_clblas(); - void finalize_clblas(); - protected: - int current_device_id_; - cl_platform_id current_cl_platform_id_; - cl_int current_platform_device_count_; - std::vector current_platform_device_ids_; - int current_platform_device_id_; - cl_context cl_context_; - cl_command_queue cl_command_queue_; - bool clblas_initialized_; +// virtual void im2col(const Dtype* data_im, const int channels, +// const int height, const int width, const int ksize, const int pad, +// const int stride, Dtype* data_col); +// +// virtual void col2im(const Dtype* data_col, const int channels, +// const int height, const int width, const int psize, const int pad, +// const int stride, Dtype* data_im); }; diff --git a/include/caffe/util/opencl_math_functions.hpp b/include/caffe/util/opencl_math_functions.hpp new file mode 100644 index 00000000000..27448d27c75 --- /dev/null +++ b/include/caffe/util/opencl_math_functions.hpp @@ -0,0 +1,134 @@ +// Copyright 2014 BVLC and contributors. + +#ifndef CAFFE_UTIL_OPENCL_MATH_FUNCTIONS_H_ +#define CAFFE_UTIL_OPENCL_MATH_FUNCTIONS_H_ + +#include "caffe/util/opencl_device.hpp" + +namespace caffe { + +#define CREATE_CL_MEM(A, M, K, FLAG) \ + cl_mem buf##A; \ + do { \ + cl_int error; \ + buf##A = clCreateBuffer( \ + CaffeOpenCL::context(), CL_MEM_##FLAG, M * K * sizeof(*A), \ + NULL, &error); \ + CL_CHECK(error); \ + } while(0) + +#define RELEASE_CL_MEM(A) clReleaseMemObject(buf##A) + +#define ENQUEUE_CL_BUFFER(FLAG, A, M, K) \ + CL_CHECK(clEnqueue##FLAG##Buffer( \ + CaffeOpenCL::queue(), buf##A, CL_TRUE, 0, M * K * sizeof(*A), \ + A, 0, NULL, NULL)); + +#define PRE_CLBLAS_CALL \ + cl_uint num_command_queues = 1; \ + cl_uint num_events_in_wait_list = 0; \ + cl_event *event_wait_list = NULL; \ + cl_event events = NULL; \ + cl_command_queue queue = CaffeOpenCL::queue(); + +#define ARRAY(A) buf##A, 0, ld##A + +#define CLBALS_TRAILING_ARGS \ + num_command_queues, &queue, num_events_in_wait_list, \ + event_wait_list, &events + +inline clblasTranspose to_clblasTranspose(const CBLAS_TRANSPOSE trans) { + switch (trans) { + case CblasNoTrans: + return clblasNoTrans; + case CblasTrans: + return clblasTrans; + case CblasConjTrans: + return clblasConjTrans; + default: + LOG(FATAL) << "Unknown CBLAS_TRANSPOSE " << trans; + } +} + +template +void caffe_opencl_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); + + +template +void caffe_opencl_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); + +template +void caffe_opencl_axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y); + +template +void caffe_opencl_axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + +template +void caffe_opencl_copy(const 
int N, const Dtype *X, Dtype *Y); + +template +void caffe_opencl_set(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_opencl_add_scalar(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_opencl_scal(const int N, const Dtype alpha, Dtype *X); + +template +Dtype caffe_opencl_dot(const int n, const Dtype* x, const Dtype* y); + +template +int caffe_opencl_hamming_distance(const int n, const Dtype* x, const Dtype* y); + +// Returns the sum of the absolute values of the elements of vector x +template +Dtype caffe_opencl_asum(const int n, const Dtype* x); + +template +void caffe_opencl_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); + +template +void caffe_opencl_copy_from_cpu(const int N, const Dtype *X, Dtype *Y); + +template +void caffe_opencl_sqr(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_exp(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_sign(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_sgnbit(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_fabs(const int n, const Dtype* x, Dtype* y); + +template +void caffe_opencl_add(const int N, const Dtype* a, + const Dtype* b, Dtype* y); + +template +void caffe_opencl_sub(const int N, const Dtype* a, + const Dtype* b, Dtype* y); + +template +void caffe_opencl_mul(const int N, const Dtype* a, + const Dtype* b, Dtype* y); + +template +void caffe_opencl_div(const int N, const Dtype* a, + const Dtype* b, Dtype* y); +} // namespace caffe + + +#endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 631c8afd068..82d4f16cd70 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -10,7 +10,6 @@ namespace caffe { shared_ptr Caffe::singleton_; - // curand seeding int64_t cluster_seedgen(void) { int64_t s, seed, pid; diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index 7c5baae21c4..67b8326a5dc 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -2,13 +2,15 @@ #include "caffe/common.hpp" #include "caffe/util/opencl_device.hpp" +#include "caffe/util/opencl_math_functions.hpp" #include namespace caffe { -template -cl_device_type OpenCLDevice::get_device_type() { +shared_ptr CaffeOpenCL::singleton_; + +cl_device_type CaffeOpenCL::get_device_type() { switch (Caffe::mode()) { case Caffe::OPENCL_CPU: return CL_DEVICE_TYPE_CPU; @@ -23,88 +25,76 @@ cl_device_type OpenCLDevice::get_device_type() { /** * http://dhruba.name/2012/08/14/opencl-cookbook-listing-all-devices-and-their-critical-attributes/ */ -template -cl_context OpenCLDevice::context() { - if (cl_context_ == NULL) { - cl_uint platformCount; - CL_CHECK(clGetPlatformIDs(0, NULL, &platformCount)); - - cl_platform_id* platforms = (cl_platform_id*) - malloc(sizeof(cl_platform_id) * platformCount); - CL_CHECK(clGetPlatformIDs(1, platforms, NULL)); - - cl_uint deviceCount; - cl_device_type device_type = get_device_type(); - int num_devices_to_skip = current_device_id_; - while (num_devices_to_skip >= 0) { - for (int i = 0; i < platformCount; i++) { - cl_context_properties properties[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties)( - platforms[i]), 0}; - // get all devices - clGetDeviceIDs(platforms[i], device_type, 0, NULL, &deviceCount); - if (num_devices_to_skip <= deviceCount) { - current_cl_platform_id_ = platforms[i]; - current_platform_device_count_ = deviceCount; - current_platform_device_id_ = num_devices_to_skip; - 
current_platform_device_ids_.resize(deviceCount); - CL_CHECK(clGetDeviceIDs(current_cl_platform_id_, device_type, - current_platform_device_count_, - &(current_platform_device_ids_[0]), NULL)); - cl_int error = CL_SUCCESS; // Used to handle error codes - // TODO: clCreateContext or clCreateContextFromType? - /* - * http://dhruba.name/2012/10/14/opencl-cookbook-how-to-leverage-multiple-devices-in-opencl/ - */ - // cl_context_ = clCreateContext(properties, deviceCount, devices, NULL, - // NULL, &error); - cl_context_ = clCreateContextFromType(properties, device_type, NULL, - NULL, &error); - CL_CHECK(error); - } - num_devices_to_skip -= deviceCount; - if (num_devices_to_skip < 0) { - break; - } +void CaffeOpenCL::create_context() { + cl_uint platformCount; + CL_CHECK(clGetPlatformIDs(0, NULL, &platformCount)); + + cl_platform_id* platforms = (cl_platform_id*) + malloc(sizeof(cl_platform_id) * platformCount); + CL_CHECK(clGetPlatformIDs(1, platforms, NULL)); + + cl_uint deviceCount; + cl_device_type device_type = get_device_type(); + int num_devices_to_skip = current_device_id_; + while (num_devices_to_skip >= 0) { + for (int i = 0; i < platformCount; i++) { + cl_context_properties properties[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)( + platforms[i]), 0}; + // get all devices + clGetDeviceIDs(platforms[i], device_type, 0, NULL, &deviceCount); + if (num_devices_to_skip <= deviceCount) { + current_cl_platform_id_ = platforms[i]; + current_platform_device_count_ = deviceCount; + current_platform_device_id_ = num_devices_to_skip; + current_platform_device_ids_.resize(deviceCount); + CL_CHECK(clGetDeviceIDs(current_cl_platform_id_, device_type, + current_platform_device_count_, + &(current_platform_device_ids_[0]), NULL)); + cl_int error = CL_SUCCESS; // Used to handle error codes + // TODO: clCreateContext or clCreateContextFromType? 
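// Note on the TODO above (an aside; the call semantics are standard OpenCL and
// the device-list form shown is only a sketch): clCreateContextFromType()
// lets the runtime pick every device of the requested type on the platform,
// while clCreateContext() takes an explicit cl_device_id list, roughly
//   cl_context ctx = clCreateContext(properties, device_count,
//                                    &current_platform_device_ids_[0],
//                                    NULL, NULL, &error);
// The explicit-list form is the one PATCH 65 later in this series switches to,
// so that a single shared context covers all of the enumerated devices.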
+/* + * http://dhruba.name/2012/10/14/opencl-cookbook-how-to-leverage-multiple-devices-in-opencl/ + */ +// cl_context_ = clCreateContext(properties, deviceCount, devices, NULL, +// NULL, &error); + cl_context_ = clCreateContextFromType(properties, device_type, NULL, + NULL, &error); + CL_CHECK(error); + } + num_devices_to_skip -= deviceCount; + if (num_devices_to_skip < 0) { + break; } } } - return cl_context_; } -template -cl_device_id OpenCLDevice::current_cl_device_id() { +cl_device_id CaffeOpenCL::current_cl_device_id() { // To initialize current platform info context(); return current_platform_device_ids_[current_platform_device_id_]; } -template -cl_command_queue OpenCLDevice::queue() { - if (cl_command_queue_ == NULL) { - cl_int error = 0; // Used to handle error codes - cl_command_queue_properties properties = 0; - cl_command_queue_ = clCreateCommandQueue( - context(), current_cl_device_id(), properties, &error); - CL_CHECK(error); - } - return cl_command_queue_; +void CaffeOpenCL::create_queue() { + cl_int error = 0; // Used to handle error codes + cl_command_queue_properties properties = 0; + cl_command_queue_ = clCreateCommandQueue( + context(), current_cl_device_id(), properties, &error); + CL_CHECK(error); } -template -void OpenCLDevice::release_context() { +void CaffeOpenCL::release_context() { CL_CHECK(clReleaseContext(cl_context_)); cl_context_ = NULL; } -template -void OpenCLDevice::release_queue() { +void CaffeOpenCL::release_queue() { CL_CHECK(clReleaseCommandQueue(cl_command_queue_)); cl_command_queue_ = NULL; } -template -void OpenCLDevice::SetDevice(const int device_id) { +void CaffeOpenCL::SetDevice(const int device_id) { if (current_device_id_ != device_id) { current_device_id_ = device_id; release_queue(); @@ -116,198 +106,59 @@ void OpenCLDevice::SetDevice(const int device_id) { } } -template -void OpenCLDevice::initialize_clblas() { +void CaffeOpenCL::initialize_clblas() { if (!clblas_initialized_) { CLBLAS_CHECK(clblasSetup()); clblas_initialized_ = true; } } -template -void OpenCLDevice::finalize_clblas() { +void CaffeOpenCL::finalize_clblas() { if (clblas_initialized_) { clblasTeardown(); clblas_initialized_ = false; } } -template <> -void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, - const int N, const int K, const float alpha, - const float* A, const float* B, - const float beta, float* C) { - int ldA = (TransA == CblasNoTrans) ? K : M; - int ldB = (TransB == CblasNoTrans) ? N : K; - int ldC = N; - clblasTranspose clTransA = to_clblasTranspose(TransA); - clblasTranspose clTransB = to_clblasTranspose(TransB); - CREATE_CL_MEM(A, M, K, READ_ONLY); - CREATE_CL_MEM(B, K, N, READ_ONLY); - CREATE_CL_MEM(C, M, N, READ_WRITE); - ENQUEUE_CL_BUFFER(Write, A, M, K); - ENQUEUE_CL_BUFFER(Write, B, K, N); - ENQUEUE_CL_BUFFER(Write, C, M, N); - PRE_CLBLAS_CALL; - // bufX is defined by the macro CREATE_CL_MEM(X, ...) - CLBLAS_CHECK(clblasSgemm(clblasRowMajor, clTransA, clTransB, - M, N, K, alpha, ARRAY(A), ARRAY(B), beta, ARRAY(C), - CLBALS_TRAILING_ARGS)); - /* Release OpenCL memory objects. */ - RELEASE_CL_MEM(C); - RELEASE_CL_MEM(B); - RELEASE_CL_MEM(A); -} - -template <> -void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, - const int N, const int K, const double alpha, - const double* A, const double* B, - const double beta, double* C) { - int ldA = (TransA == CblasNoTrans) ? K : M; - int ldB = (TransB == CblasNoTrans) ? 
N : K; - int ldC = N; - clblasTranspose clTransA = to_clblasTranspose(TransA); - clblasTranspose clTransB = to_clblasTranspose(TransB); - CREATE_CL_MEM(A, M, K, READ_ONLY); - CREATE_CL_MEM(B, K, N, READ_ONLY); - CREATE_CL_MEM(C, M, N, READ_WRITE); - ENQUEUE_CL_BUFFER(Write, A, M, K); - ENQUEUE_CL_BUFFER(Write, B, K, N); - ENQUEUE_CL_BUFFER(Write, C, M, N); - PRE_CLBLAS_CALL; - // bufX is defined by the macro CREATE_CL_MEM(X, ...) - CLBLAS_CHECK(clblasDgemm(clblasRowMajor, clTransA, clTransB, - M, N, K, alpha, ARRAY(A), ARRAY(B), beta, ARRAY(C), - CLBALS_TRAILING_ARGS)); - /* Release OpenCL memory objects. */ - RELEASE_CL_MEM(C); - RELEASE_CL_MEM(B); - RELEASE_CL_MEM(A); -} - -template <> -void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, - const float* x, const float beta, float* y) { - clblasTranspose clTransA = to_clblasTranspose(TransA); - int ldA = (TransA == CblasNoTrans) ? N : M; - int ldx = N; - int ldy = N; - CREATE_CL_MEM(A, M, N, READ_ONLY); - CREATE_CL_MEM(x, N, 1, READ_ONLY); - CREATE_CL_MEM(y, M, 1, READ_WRITE); - PRE_CLBLAS_CALL; - CLBLAS_CHECK(clblasSgemv(clblasRowMajor, clTransA, M, N, alpha, - ARRAY(A), ARRAY(x), beta, ARRAY(y), - CLBALS_TRAILING_ARGS)); -} - -template <> -void OpenCLDevice::gemv( - const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, - const double* x, const double beta, double* y) { - clblasTranspose clTransA = to_clblasTranspose(TransA); - int ldA = (TransA == CblasNoTrans) ? N : M; - int ldx = N; - int ldy = N; - CREATE_CL_MEM(A, M, N, READ_ONLY); - CREATE_CL_MEM(x, N, 1, READ_ONLY); - CREATE_CL_MEM(y, M, 1, READ_WRITE); - PRE_CLBLAS_CALL; - CLBLAS_CHECK(clblasDgemv(clblasRowMajor, clTransA, M, N, alpha, - ARRAY(A), ARRAY(x), beta, ARRAY(y), - CLBALS_TRAILING_ARGS)); -} - -template <> -void OpenCLDevice::axpy(const int N, const float alpha, - const float* X, float* Y) { - int ldX = N; - int ldY = N; - CREATE_CL_MEM(X, N, 1, READ_ONLY); - CREATE_CL_MEM(Y, N, 1, READ_WRITE); - PRE_CLBLAS_CALL; - CLBLAS_CHECK(clblasSaxpy( - N, alpha, ARRAY(X), ARRAY(Y), - CLBALS_TRAILING_ARGS)); -} - -template <> -void OpenCLDevice::axpy(const int N, const double alpha, - const double* X, double* Y) { - int ldX = N; - int ldY = N; - CREATE_CL_MEM(X, N, 1, READ_ONLY); - CREATE_CL_MEM(Y, N, 1, READ_WRITE); - PRE_CLBLAS_CALL; - CLBLAS_CHECK(clblasDaxpy( - N, alpha, ARRAY(X), ARRAY(Y), - CLBALS_TRAILING_ARGS)); +template +void OpenCLDevice::gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const Dtype alpha, + const Dtype* A, const Dtype* B, + const Dtype beta, Dtype* C) { + caffe_opencl_gemm(TransA, TransB, M, N, K, alpha, A, B, beta, C); } -template <> -void OpenCLDevice::scal(const int N, const float alpha, float *X) { - int ldX = N; - CREATE_CL_MEM(X, N, 1, READ_WRITE); - PRE_CLBLAS_CALL; - CLBLAS_CHECK(clblasSscal( - N, alpha, ARRAY(X), - CLBALS_TRAILING_ARGS)); +template +void OpenCLDevice::gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const Dtype alpha, const Dtype* A, + const Dtype* x, const Dtype beta, Dtype* y) { + caffe_opencl_gemv(TransA, M, N, alpha, A, x, beta, y); } -template <> -void OpenCLDevice::scal(const int N, const double alpha, double *X) { - int ldX = N; - CREATE_CL_MEM(X, N, 1, READ_WRITE); - PRE_CLBLAS_CALL; - CLBLAS_CHECK(clblasDscal( - N, alpha, ARRAY(X), - CLBALS_TRAILING_ARGS)); +template +void OpenCLDevice::axpy(const int N, const Dtype alpha, + const Dtype* X, 
Dtype* Y) { + caffe_opencl_axpy(N, alpha, X, Y); } -template <> -void OpenCLDevice::axpby( - const int N, const float alpha, const float* X, - const float beta, float* Y) { - this->scal(N, beta, Y); - this->axpy(N, alpha, X, Y); +template +void OpenCLDevice::scal(const int N, const Dtype alpha, Dtype *X) { + caffe_opencl_scal(N, alpha, X); } -template <> -void OpenCLDevice::axpby( - const int N, const double alpha, const double* X, - const double beta, double* Y) { - this->scal(N, beta, Y); - this->axpy(N, alpha, X, Y); +template +void OpenCLDevice::axpby( + const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y) { + caffe_opencl_axpby(N, alpha, X, beta, Y); } -template <> -void OpenCLDevice::copy(const int N, const float *X, float *Y) { - int ldX = N; - int ldY = N; - CREATE_CL_MEM(X, N, 1, READ_ONLY); - CREATE_CL_MEM(Y, N, 1, READ_WRITE); - PRE_CLBLAS_CALL; - CLBLAS_CHECK(clblasScopy( - N, ARRAY(X), ARRAY(Y), - CLBALS_TRAILING_ARGS)); +template +void OpenCLDevice::copy(const int N, const Dtype *X, Dtype *Y) { + caffe_opencl_copy(N, X, Y); } -template <> -void OpenCLDevice::copy(const int N, const double *X, double *Y) { - int ldX = N; - int ldY = N; - CREATE_CL_MEM(X, N, 1, READ_ONLY); - CREATE_CL_MEM(Y, N, 1, READ_WRITE); - PRE_CLBLAS_CALL; - CLBLAS_CHECK(clblasDcopy( - N, ARRAY(X), ARRAY(Y), - CLBALS_TRAILING_ARGS)); -} /** * http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clEnqueueWriteBuffer.html @@ -315,141 +166,64 @@ void OpenCLDevice::copy(const int N, const double *X, double *Y) { template void OpenCLDevice::copy_from_cpu(const int N, const Dtype *X, Dtype *Y) { - CREATE_CL_MEM(Y, N, 1, READ_WRITE); - cl_bool blocking_write = CL_TRUE; - cl_uint num_events_in_wait_list = 0; - cl_event *event_wait_list = NULL; - cl_event events = NULL; - CL_CHECK(clEnqueueWriteBuffer( - OpenCLDevice::queue(), bufY, blocking_write, 0, N * sizeof(Dtype), - X, num_events_in_wait_list, event_wait_list, &events)); + caffe_opencl_copy_from_cpu(N, X, Y); } -template -void OpenCLDevice::copy_from_cpu(const int N, const float *X, - float *Y); -template -void OpenCLDevice::copy_from_cpu(const int N, const double *X, - double *Y); - /** * http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clEnqueueFillBuffer.html */ template void OpenCLDevice::set(const int N, const Dtype alpha, Dtype *X) { -#ifdef CL_VERSION_1_2 - CREATE_CL_MEM(X, N, 1, READ_WRITE); - cl_uint num_events_in_wait_list = 0; - cl_event *event_wait_list = NULL; - cl_event event = NULL; - CL_CHECK(clEnqueueFillBuffer( - OpenCLDevice::queue(), bufX, static_cast(&alpha), sizeof(Dtype), - 0, sizeof(Dtype) * N, num_events_in_wait_list, event_wait_list, &event)); -#else - std::vector tmp(N, alpha); - copy_from_cpu(N, &tmp[0], X); -#endif -} - -template -void OpenCLDevice::set(const int N, const float alpha, float *X); -template -void OpenCLDevice::set(const int N, const double alpha, double *X); - -template -void OpenCLDevice::add_scalar(const int N, const Dtype alpha, - Dtype *X) { - NOT_IMPLEMENTED; + caffe_opencl_set(N, alpha, X); } -template -void OpenCLDevice::add_scalar(const int N, const float alpha, float *X); -template -void OpenCLDevice::add_scalar(const int N, const double alpha, - double *X); - - -template -void OpenCLDevice::powx(const int N, const Dtype* a, const Dtype b, - Dtype* y) { - NOT_IMPLEMENTED; -// caffe_gpu_powx(N, a, b, y); -} -template -void OpenCLDevice::powx(const int N, const float* a, const float b, - float *y); -template -void OpenCLDevice::powx(const int N, const double* a, - const 
double b, double *y); - - -template -void OpenCLDevice::rng_uniform(const int N, const Dtype a, - const Dtype b, Dtype* r) { - NOT_IMPLEMENTED; -// caffe_gpu_rng_uniform(N, a, b, r); -} - -template -void OpenCLDevice::rng_uniform( - const int N, const float a, const float b, float* r); -template -void OpenCLDevice::rng_uniform( - const int N, const double a, const double b, double* r); - -template -void OpenCLDevice::rng_gaussian(const int N, const Dtype mu, - const Dtype sigma, Dtype* r) { - NOT_IMPLEMENTED; -// caffe_gpu_rng_gaussian(N, mu, sigma, r); -} - -template -void OpenCLDevice::rng_gaussian( - const int N, const float mu, const float sigma, float* r); -template -void OpenCLDevice::rng_gaussian( - const int N, const double mu, const double sigma, double* r); +//template +//void OpenCLDevice::add_scalar(const int N, const Dtype alpha, +// Dtype *X) { +// NOT_IMPLEMENTED; +//} -template -void OpenCLDevice::rng_bernoulli(const int N, const Dtype p, int* r) { - NOT_IMPLEMENTED; -// caffe_gpu_rng_bernoulli(N, p, r); -} +//template +//void OpenCLDevice::powx(const int N, const Dtype* a, const Dtype b, +// Dtype* y) { +// NOT_IMPLEMENTED; +//// caffe_gpu_powx(N, a, b, y); +//} -template -void OpenCLDevice::rng_bernoulli(const int N, const float p, int* r); -template -void OpenCLDevice::rng_bernoulli(const int N, const double p, int* r); +//template +//void OpenCLDevice::rng_uniform(const int N, const Dtype a, +// const Dtype b, Dtype* r) { +// NOT_IMPLEMENTED; +//// caffe_gpu_rng_uniform(N, a, b, r); +//} -template -void OpenCLDevice::dot(const int N, const Dtype* x, const Dtype* y, - Dtype* out) { - NOT_IMPLEMENTED; -// caffe_gpu_dot(N, x, y, out); -} +//template +//void OpenCLDevice::rng_gaussian(const int N, const Dtype mu, +// const Dtype sigma, Dtype* r) { +// NOT_IMPLEMENTED; +//// caffe_gpu_rng_gaussian(N, mu, sigma, r); +//} -template -void OpenCLDevice::dot(const int N, const float* x, const float* y, - float* out); -template -void OpenCLDevice::dot(const int N, const double* x, const double* y, - double* out); +//template +//void OpenCLDevice::rng_bernoulli(const int N, const Dtype p, int* r) { +// NOT_IMPLEMENTED; +//// caffe_gpu_rng_bernoulli(N, p, r); +//} -template -void OpenCLDevice::hamming_distance(const int N, const Dtype* x, - const Dtype* y, uint32_t* out) { - NOT_IMPLEMENTED; -// *out = caffe_gpu_hamming_distance(N, x, y); -} +//template +//void OpenCLDevice::dot(const int N, const Dtype* x, const Dtype* y, +// Dtype* out) { +// NOT_IMPLEMENTED; +//// caffe_gpu_dot(N, x, y, out); +//} -template -void OpenCLDevice::hamming_distance(const int N, const float* x, - const float* y, uint32_t* out); -template -void OpenCLDevice::hamming_distance(const int N, const double* x, - const double* y, uint32_t* out); +//template +//void OpenCLDevice::hamming_distance(const int N, const Dtype* x, +// const Dtype* y, uint32_t* out) { +// NOT_IMPLEMENTED; +//// *out = caffe_gpu_hamming_distance(N, x, y); +//} /** * @@ -467,77 +241,46 @@ clblasSasum( const cl_event *eventWaitList, cl_event *events) */ -template -void OpenCLDevice::asum(const int N, const Dtype* x, Dtype* y) { - NOT_IMPLEMENTED; -// CREATE_CL_MEM(x, N, 1, READ_ONLY); -// CREATE_CL_MEM(y, N, 1, READ_WRITE); -// PRE_CLBLAS_CALL; -// CLBLAS_CHECK(clblasSasum( -// N, alpha, ARRAY(X), -// CLBALS_TRAILING_ARGS)); -} - -template -void OpenCLDevice::asum(const int N, const float* x, float* y); -template -void OpenCLDevice::asum(const int N, const double* x, double* y); - -template -void OpenCLDevice::scale(const int N, const 
Dtype alpha, - const Dtype *x, Dtype* y) { - this->copy(N, x, y); - this->scal(N, alpha, y); -} - -template -void OpenCLDevice::scale(const int N, const float alpha, - const float *x, float* y); -template -void OpenCLDevice::scale(const int N, const double alpha, - const double *x, double* y); - -template -void OpenCLDevice::im2col( - const Dtype* data_im, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, Dtype* data_col) { +//template +//void OpenCLDevice::asum(const int N, const Dtype* x, Dtype* y) { // NOT_IMPLEMENTED; -// im2col_gpu(data_im, channels, height, width, ksize, pad, stride, -// data_col); -} - -template -void OpenCLDevice::im2col( - const float* data_im, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, float* data_col); -template -void OpenCLDevice::im2col( - const double* data_im, const int channels, - const int height, const int width, const int ksize, const int pad, - const int stride, double* data_col); - -template -void OpenCLDevice::col2im( - const Dtype* data_col, const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, Dtype* data_im) { -// NOT_IMPLEMENTED; -// col2im_gpu(data_col, channels, height, width, psize, pad, stride, -// data_im); -} - -template -void OpenCLDevice::col2im( - const float* data_col, const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, float* data_im); -template -void OpenCLDevice::col2im( - const double* data_col, const int channels, - const int height, const int width, const int psize, const int pad, - const int stride, double* data_im); +//// CREATE_CL_MEM(x, N, 1, READ_ONLY); +//// CREATE_CL_MEM(y, N, 1, READ_WRITE); +//// PRE_CLBLAS_CALL; +//// CLBLAS_CHECK(clblasSasum( +//// N, alpha, ARRAY(X), +//// CLBALS_TRAILING_ARGS)); +//} + +//template +//void OpenCLDevice::scale(const int N, const Dtype alpha, +// const Dtype *x, Dtype* y) { +// this->copy(N, x, y); +// this->scal(N, alpha, y); +//} + +//template +//void OpenCLDevice::im2col( +// const Dtype* data_im, const int channels, +// const int height, const int width, const int ksize, const int pad, +// const int stride, Dtype* data_col) { +//// NOT_IMPLEMENTED; +//// im2col_gpu(data_im, channels, height, width, ksize, pad, stride, +//// data_col); +//} + +//template +//void OpenCLDevice::col2im( +// const Dtype* data_col, const int channels, +// const int height, const int width, const int psize, const int pad, +// const int stride, Dtype* data_im) { +//// NOT_IMPLEMENTED; +//// col2im_gpu(data_col, channels, height, width, psize, pad, stride, +//// data_im); +//} + + +INSTANTIATE_CLASS(OpenCLDevice); const char* clGetErrorString(cl_int error) { switch (error) { diff --git a/src/caffe/util/opencl_math_functions.cpp b/src/caffe/util/opencl_math_functions.cpp new file mode 100644 index 00000000000..aca0568e52a --- /dev/null +++ b/src/caffe/util/opencl_math_functions.cpp @@ -0,0 +1,220 @@ +// Copyright 2014 BVLC and contributors. + +//#include "caffe/common.hpp" +#include "caffe/util/opencl_math_functions.hpp" + +namespace caffe { + +template <> +void caffe_opencl_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + int ldA = (TransA == CblasNoTrans) ? K : M; + int ldB = (TransB == CblasNoTrans) ? 
N : K; + int ldC = N; + clblasTranspose clTransA = to_clblasTranspose(TransA); + clblasTranspose clTransB = to_clblasTranspose(TransB); + CREATE_CL_MEM(A, M, K, READ_ONLY); + CREATE_CL_MEM(B, K, N, READ_ONLY); + CREATE_CL_MEM(C, M, N, READ_WRITE); + ENQUEUE_CL_BUFFER(Write, A, M, K); + ENQUEUE_CL_BUFFER(Write, B, K, N); + ENQUEUE_CL_BUFFER(Write, C, M, N); + PRE_CLBLAS_CALL; + // bufX is defined by the macro CREATE_CL_MEM(X, ...) + CLBLAS_CHECK(clblasSgemm(clblasRowMajor, clTransA, clTransB, + M, N, K, alpha, ARRAY(A), ARRAY(B), beta, ARRAY(C), + CLBALS_TRAILING_ARGS)); + /* Release OpenCL memory objects. */ + RELEASE_CL_MEM(C); + RELEASE_CL_MEM(B); + RELEASE_CL_MEM(A); +} + +template <> +void caffe_opencl_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + int ldA = (TransA == CblasNoTrans) ? K : M; + int ldB = (TransB == CblasNoTrans) ? N : K; + int ldC = N; + clblasTranspose clTransA = to_clblasTranspose(TransA); + clblasTranspose clTransB = to_clblasTranspose(TransB); + CREATE_CL_MEM(A, M, K, READ_ONLY); + CREATE_CL_MEM(B, K, N, READ_ONLY); + CREATE_CL_MEM(C, M, N, READ_WRITE); + ENQUEUE_CL_BUFFER(Write, A, M, K); + ENQUEUE_CL_BUFFER(Write, B, K, N); + ENQUEUE_CL_BUFFER(Write, C, M, N); + PRE_CLBLAS_CALL; + // bufX is defined by the macro CREATE_CL_MEM(X, ...) + CLBLAS_CHECK(clblasDgemm(clblasRowMajor, clTransA, clTransB, + M, N, K, alpha, ARRAY(A), ARRAY(B), beta, ARRAY(C), + CLBALS_TRAILING_ARGS)); + /* Release OpenCL memory objects. */ + RELEASE_CL_MEM(C); + RELEASE_CL_MEM(B); + RELEASE_CL_MEM(A); +} + +template <> +void caffe_opencl_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { + clblasTranspose clTransA = to_clblasTranspose(TransA); + int ldA = (TransA == CblasNoTrans) ? N : M; + int ldx = N; + int ldy = N; + CREATE_CL_MEM(A, M, N, READ_ONLY); + CREATE_CL_MEM(x, N, 1, READ_ONLY); + CREATE_CL_MEM(y, M, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasSgemv(clblasRowMajor, clTransA, M, N, alpha, + ARRAY(A), ARRAY(x), beta, ARRAY(y), + CLBALS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { + clblasTranspose clTransA = to_clblasTranspose(TransA); + int ldA = (TransA == CblasNoTrans) ? 
N : M; + int ldx = N; + int ldy = N; + CREATE_CL_MEM(A, M, N, READ_ONLY); + CREATE_CL_MEM(x, N, 1, READ_ONLY); + CREATE_CL_MEM(y, M, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasDgemv(clblasRowMajor, clTransA, M, N, alpha, + ARRAY(A), ARRAY(x), beta, ARRAY(y), + CLBALS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_axpy(const int N, const float alpha, const float* X, + float* Y) { + int ldX = N; + int ldY = N; + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasSaxpy( + N, alpha, ARRAY(X), ARRAY(Y), + CLBALS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_axpy(const int N, const double alpha, const double* X, + double* Y) { + int ldX = N; + int ldY = N; + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasDaxpy( + N, alpha, ARRAY(X), ARRAY(Y), + CLBALS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_copy(const int N, const float* X, float* Y) { + int ldX = N; + int ldY = N; + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasScopy( + N, ARRAY(X), ARRAY(Y), + CLBALS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_copy(const int N, const double* X, double* Y) { + int ldX = N; + int ldY = N; + CREATE_CL_MEM(X, N, 1, READ_ONLY); + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasDcopy( + N, ARRAY(X), ARRAY(Y), + CLBALS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_scal(const int N, const float alpha, float *X) { + int ldX = N; + CREATE_CL_MEM(X, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasSscal( + N, alpha, ARRAY(X), + CLBALS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_scal(const int N, const double alpha, double *X) { + int ldX = N; + CREATE_CL_MEM(X, N, 1, READ_WRITE); + PRE_CLBLAS_CALL; + CLBLAS_CHECK(clblasDscal( + N, alpha, ARRAY(X), + CLBALS_TRAILING_ARGS)); +} + +template <> +void caffe_opencl_axpby(const int N, const float alpha, const float* X, + const float beta, float* Y) { + caffe_opencl_scal(N, beta, Y); + caffe_opencl_axpy(N, alpha, X, Y); +} + +template <> +void caffe_opencl_axpby(const int N, const double alpha, const double* X, + const double beta, double* Y) { + caffe_opencl_scal(N, beta, Y); + caffe_opencl_axpy(N, alpha, X, Y); +} + + +template +void caffe_opencl_copy_from_cpu(const int N, const Dtype *X, Dtype *Y) { + CREATE_CL_MEM(Y, N, 1, READ_WRITE); + cl_bool blocking_write = CL_TRUE; + cl_uint num_events_in_wait_list = 0; + cl_event *event_wait_list = NULL; + cl_event events = NULL; + CL_CHECK(clEnqueueWriteBuffer( + CaffeOpenCL::queue(), bufY, blocking_write, 0, N * sizeof(Dtype), + X, num_events_in_wait_list, event_wait_list, &events)); +} + +template +void caffe_opencl_copy_from_cpu(const int N, const float *X, float *Y); +template +void caffe_opencl_copy_from_cpu(const int N, const double *X, double *Y); + + +template +void caffe_opencl_set(const int N, const Dtype alpha, Dtype *X) { +#ifdef CL_VERSION_1_2 + CREATE_CL_MEM(X, N, 1, READ_WRITE); + cl_uint num_events_in_wait_list = 0; + cl_event *event_wait_list = NULL; + cl_event event = NULL; + CL_CHECK(clEnqueueFillBuffer( + CaffeOpenCL::queue(), bufX, static_cast(&alpha), sizeof(Dtype), + 0, sizeof(Dtype) * N, num_events_in_wait_list, event_wait_list, &event)); +#else + std::vector tmp(N, alpha); + caffe_opencl_copy_from_cpu(N, &tmp[0], X); +#endif +} + +template +void caffe_opencl_set(const int N, const float alpha, float *X); 
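// (Reading of the set() implementation above: with OpenCL 1.2, alpha is passed
// to clEnqueueFillBuffer as a sizeof(Dtype)-byte pattern that the runtime tiles
// across the N-element buffer; on pre-1.2 runtimes the fallback builds a host
// std::vector of N copies of alpha and pays one extra host-to-device transfer
// through caffe_opencl_copy_from_cpu.)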
+template +void caffe_opencl_set(const int N, const double alpha, double *X); + +} // namespace caffe From 2294bc9891284b174ade74c443e12fdef1034a1b Mon Sep 17 00:00:00 2001 From: Kai Li Date: Thu, 19 Jun 2014 15:25:58 +0800 Subject: [PATCH 63/75] Implement the macros to define the OpenCL kernels for the math functions --- include/caffe/util/opencl_device.hpp | 6 - include/caffe/util/opencl_math_functions.hpp | 149 +++++++++++++++++-- src/caffe/util/opencl_math_functions.cpp | 33 ++-- 3 files changed, 155 insertions(+), 33 deletions(-) diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp index 23b41a158d0..16d837e5234 100644 --- a/include/caffe/util/opencl_device.hpp +++ b/include/caffe/util/opencl_device.hpp @@ -35,12 +35,6 @@ namespace caffe { const char* clGetErrorString(cl_int error); const char* clblasGetErrorString(clblasStatus status); -// OpenCL: grid stride looping -#define OPENCL_KERNEL_LOOP(i, n) \ - for (int i = get_global_id(0); \ - i < (n); \ - i += get_global_size(0)) - class CaffeOpenCL { public: inline static CaffeOpenCL& Get() { diff --git a/include/caffe/util/opencl_math_functions.hpp b/include/caffe/util/opencl_math_functions.hpp index 27448d27c75..ad4c1fe4132 100644 --- a/include/caffe/util/opencl_math_functions.hpp +++ b/include/caffe/util/opencl_math_functions.hpp @@ -33,10 +33,127 @@ namespace caffe { #define ARRAY(A) buf##A, 0, ld##A -#define CLBALS_TRAILING_ARGS \ +#define CLBLAS_TRAILING_ARGS \ num_command_queues, &queue, num_events_in_wait_list, \ event_wait_list, &events +#define OPENCL_UNARY_KERNEL(Dtype_str, name_str, operation_str) \ +"template \n" \ +"__kernel void " name_str "( \n" \ +" __global " Dtype_str "* x, \n" \ +" __global " Dtype_str "* y, \n" \ +" const unsigned int count) { \n" \ +" for (int i = get_global_id(0); \n" \ +" i < (count); \n" \ +" i += get_global_size(0)) { \n" \ +" " operation_str "; \n" \ +"} \n" \ +"\n"; + +#define OPENCL_BINARY_KERNEL(Dtype_str, name_str, operation_str) \ +"__kernel void " name_str "( \n" \ +" __global " Dtype_str "* a, \n" \ +" __global " Dtype_str "* b, \n" \ +" __global " Dtype_str "* y, \n" \ +" const unsigned int count) { \n" \ +" for (int i = get_global_id(0); \n" \ +" i < (count); \n" \ +" i += get_global_size(0)) { \n" \ +" " operation_str "; \n" \ +"} \n" \ +"\n" + +// local_size: Number of work items in each local work group +// global_size: Number of total work items +#define DEFINE_LOCAL_AND_GLOBAL_SIZE(n) \ + const size_t local_size = 64; \ + const size_t global_size = (n + local_size - 1) \ + / local_size + + +// https://www.olcf.ornl.gov/tutorials/opencl-vector-addition/ +#define DEFINE_OPENCL_UNARY_FUNC(Dtype, name, operation) \ +template <> \ +void caffe_opencl_##name(const int n, const Dtype *x, Dtype *y) { \ + const char* kernel_source = OPENCL_UNARY_KERNEL(#Dtype, #name, \ + #operation); \ + cl_context context = CaffeOpenCL::context(); \ + cl_command_queue queue = CaffeOpenCL::queue(); \ + cl_int error; \ + cl_program program = clCreateProgramWithSource( \ + context, 1, (const char **) & kernel_source, NULL, &error); \ + CL_CHECK(error); \ + clBuildProgram(program, 0, NULL, NULL, NULL, NULL); \ + cl_kernel kernel = clCreateKernel(program, #name, &error); \ + CL_CHECK(error); \ + size_t bytes = n * sizeof(Dtype); \ + cl_mem d_x = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); \ + cl_mem d_y = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); \ + CL_CHECK(clEnqueueWriteBuffer(queue, d_x, CL_TRUE, 0, \ + bytes, x, 0, NULL, NULL)); \ 
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_x)); \ + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_y)); \ + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(unsigned int), &n)); \ + DEFINE_LOCAL_AND_GLOBAL_SIZE(n); \ + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, \ + &local_size, 0, NULL, NULL)); \ + CL_CHECK(clFinish(queue)); \ + CL_CHECK(clEnqueueReadBuffer(queue, d_y, CL_TRUE, 0, \ + bytes, y, 0, NULL, NULL )); \ + CL_CHECK(clReleaseMemObject(d_x)); \ + CL_CHECK(clReleaseMemObject(d_y)); \ + CL_CHECK(clReleaseProgram(program)); \ + CL_CHECK(clReleaseKernel(kernel)); \ +} + +#define DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(name, operation) \ + DEFINE_OPENCL_UNARY_FUNC(float, name, operation) \ + DEFINE_OPENCL_UNARY_FUNC(double, name, operation) \ + +#define DEFINE_OPENCL_BINARY_FUNC(Dtype, name, operation) \ +template <> \ +void caffe_opencl_##name(const int n, const Dtype *a, const Dtype *b, \ + Dtype *y) { \ + const char* kernel_source = OPENCL_BINARY_KERNEL(#Dtype, #name, \ + #operation); \ + cl_context context = CaffeOpenCL::context(); \ + cl_command_queue queue = CaffeOpenCL::queue(); \ + cl_int error; \ + cl_program program = clCreateProgramWithSource( \ + context, 1, (const char **) & kernel_source, NULL, &error); \ + CL_CHECK(error); \ + clBuildProgram(program, 0, NULL, NULL, NULL, NULL); \ + cl_kernel kernel = clCreateKernel(program, #name, &error); \ + CL_CHECK(error); \ + size_t bytes = n * sizeof(Dtype); \ + cl_mem d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); \ + cl_mem d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); \ + cl_mem d_y = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); \ + CL_CHECK(clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, \ + bytes, a, 0, NULL, NULL)); \ + CL_CHECK(clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, \ + bytes, b, 0, NULL, NULL)); \ + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a)); \ + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b)); \ + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_y)); \ + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(unsigned int), &n)); \ + DEFINE_LOCAL_AND_GLOBAL_SIZE(n); \ + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, \ + &local_size, 0, NULL, NULL)); \ + CL_CHECK(clFinish(queue)); \ + CL_CHECK(clEnqueueReadBuffer(queue, d_y, CL_TRUE, 0, \ + bytes, y, 0, NULL, NULL )); \ + CL_CHECK(clReleaseMemObject(d_a)); \ + CL_CHECK(clReleaseMemObject(d_b)); \ + CL_CHECK(clReleaseMemObject(d_y)); \ + CL_CHECK(clReleaseProgram(program)); \ + CL_CHECK(clReleaseKernel(kernel)); \ +} + +#define DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(name, operation) \ + DEFINE_OPENCL_BINARY_FUNC(float, name, operation) \ + DEFINE_OPENCL_BINARY_FUNC(double, name, operation) + inline clblasTranspose to_clblasTranspose(const CBLAS_TRANSPOSE trans) { switch (trans) { case CblasNoTrans: @@ -52,35 +169,35 @@ inline clblasTranspose to_clblasTranspose(const CBLAS_TRANSPOSE trans) { template void caffe_opencl_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const CBLAS_TRANSPOSE TransB, const int M, const int n, const int K, const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, Dtype* C); template -void caffe_opencl_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, +void caffe_opencl_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int n, const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, Dtype* y); template -void 
caffe_opencl_axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y); +void caffe_opencl_axpy(const int n, const Dtype alpha, const Dtype* x, + Dtype* y); template -void caffe_opencl_axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y); +void caffe_opencl_axpby(const int n, const Dtype alpha, const Dtype* x, + const Dtype beta, Dtype* y); template -void caffe_opencl_copy(const int N, const Dtype *X, Dtype *Y); +void caffe_opencl_copy(const int n, const Dtype *x, Dtype *y); template -void caffe_opencl_set(const int N, const Dtype alpha, Dtype *X); +void caffe_opencl_set(const int n, const Dtype alpha, Dtype *x); template -void caffe_opencl_add_scalar(const int N, const Dtype alpha, Dtype *X); +void caffe_opencl_add_scalar(const int n, const Dtype alpha, Dtype *x); template -void caffe_opencl_scal(const int N, const Dtype alpha, Dtype *X); +void caffe_opencl_scal(const int n, const Dtype alpha, Dtype *x); template Dtype caffe_opencl_dot(const int n, const Dtype* x, const Dtype* y); @@ -96,7 +213,7 @@ template void caffe_opencl_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); template -void caffe_opencl_copy_from_cpu(const int N, const Dtype *X, Dtype *Y); +void caffe_opencl_copy_from_cpu(const int n, const Dtype *x, Dtype *y); template void caffe_opencl_sqr(const int n, const Dtype* x, Dtype* y); @@ -114,19 +231,19 @@ template void caffe_opencl_fabs(const int n, const Dtype* x, Dtype* y); template -void caffe_opencl_add(const int N, const Dtype* a, +void caffe_opencl_add(const int n, const Dtype* a, const Dtype* b, Dtype* y); template -void caffe_opencl_sub(const int N, const Dtype* a, +void caffe_opencl_sub(const int n, const Dtype* a, const Dtype* b, Dtype* y); template -void caffe_opencl_mul(const int N, const Dtype* a, +void caffe_opencl_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y); template -void caffe_opencl_div(const int N, const Dtype* a, +void caffe_opencl_div(const int n, const Dtype* a, const Dtype* b, Dtype* y); } // namespace caffe diff --git a/src/caffe/util/opencl_math_functions.cpp b/src/caffe/util/opencl_math_functions.cpp index aca0568e52a..fecd2fca3b5 100644 --- a/src/caffe/util/opencl_math_functions.cpp +++ b/src/caffe/util/opencl_math_functions.cpp @@ -25,7 +25,7 @@ void caffe_opencl_gemm(const CBLAS_TRANSPOSE TransA, // bufX is defined by the macro CREATE_CL_MEM(X, ...) CLBLAS_CHECK(clblasSgemm(clblasRowMajor, clTransA, clTransB, M, N, K, alpha, ARRAY(A), ARRAY(B), beta, ARRAY(C), - CLBALS_TRAILING_ARGS)); + CLBLAS_TRAILING_ARGS)); /* Release OpenCL memory objects. */ RELEASE_CL_MEM(C); RELEASE_CL_MEM(B); @@ -52,7 +52,7 @@ void caffe_opencl_gemm(const CBLAS_TRANSPOSE TransA, // bufX is defined by the macro CREATE_CL_MEM(X, ...) CLBLAS_CHECK(clblasDgemm(clblasRowMajor, clTransA, clTransB, M, N, K, alpha, ARRAY(A), ARRAY(B), beta, ARRAY(C), - CLBALS_TRAILING_ARGS)); + CLBLAS_TRAILING_ARGS)); /* Release OpenCL memory objects. 
*/ RELEASE_CL_MEM(C); RELEASE_CL_MEM(B); @@ -73,7 +73,7 @@ void caffe_opencl_gemv(const CBLAS_TRANSPOSE TransA, const int M, PRE_CLBLAS_CALL; CLBLAS_CHECK(clblasSgemv(clblasRowMajor, clTransA, M, N, alpha, ARRAY(A), ARRAY(x), beta, ARRAY(y), - CLBALS_TRAILING_ARGS)); + CLBLAS_TRAILING_ARGS)); } template <> @@ -90,7 +90,7 @@ void caffe_opencl_gemv(const CBLAS_TRANSPOSE TransA, const int M, PRE_CLBLAS_CALL; CLBLAS_CHECK(clblasDgemv(clblasRowMajor, clTransA, M, N, alpha, ARRAY(A), ARRAY(x), beta, ARRAY(y), - CLBALS_TRAILING_ARGS)); + CLBLAS_TRAILING_ARGS)); } template <> @@ -103,7 +103,7 @@ void caffe_opencl_axpy(const int N, const float alpha, const float* X, PRE_CLBLAS_CALL; CLBLAS_CHECK(clblasSaxpy( N, alpha, ARRAY(X), ARRAY(Y), - CLBALS_TRAILING_ARGS)); + CLBLAS_TRAILING_ARGS)); } template <> @@ -116,7 +116,7 @@ void caffe_opencl_axpy(const int N, const double alpha, const double* X, PRE_CLBLAS_CALL; CLBLAS_CHECK(clblasDaxpy( N, alpha, ARRAY(X), ARRAY(Y), - CLBALS_TRAILING_ARGS)); + CLBLAS_TRAILING_ARGS)); } template <> @@ -128,7 +128,7 @@ void caffe_opencl_copy(const int N, const float* X, float* Y) { PRE_CLBLAS_CALL; CLBLAS_CHECK(clblasScopy( N, ARRAY(X), ARRAY(Y), - CLBALS_TRAILING_ARGS)); + CLBLAS_TRAILING_ARGS)); } template <> @@ -140,7 +140,7 @@ void caffe_opencl_copy(const int N, const double* X, double* Y) { PRE_CLBLAS_CALL; CLBLAS_CHECK(clblasDcopy( N, ARRAY(X), ARRAY(Y), - CLBALS_TRAILING_ARGS)); + CLBLAS_TRAILING_ARGS)); } template <> @@ -150,7 +150,7 @@ void caffe_opencl_scal(const int N, const float alpha, float *X) { PRE_CLBLAS_CALL; CLBLAS_CHECK(clblasSscal( N, alpha, ARRAY(X), - CLBALS_TRAILING_ARGS)); + CLBLAS_TRAILING_ARGS)); } template <> @@ -160,7 +160,7 @@ void caffe_opencl_scal(const int N, const double alpha, double *X) { PRE_CLBLAS_CALL; CLBLAS_CHECK(clblasDscal( N, alpha, ARRAY(X), - CLBALS_TRAILING_ARGS)); + CLBLAS_TRAILING_ARGS)); } template <> @@ -177,7 +177,6 @@ void caffe_opencl_axpby(const int N, const double alpha, const double* X caffe_opencl_axpy(N, alpha, X, Y); } - template void caffe_opencl_copy_from_cpu(const int N, const Dtype *X, Dtype *Y) { CREATE_CL_MEM(Y, N, 1, READ_WRITE); @@ -217,4 +216,16 @@ void caffe_opencl_set(const int N, const float alpha, float *X); template void caffe_opencl_set(const int N, const double alpha, double *X); + +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sqr, y[i] = x[i] * x[i]); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(exp, y[i] = exp(x[i])); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sign, y[i] = sign(x[i])); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sgnbit, y[i] = signbit(x[i])); +DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(fabs, y[i] = fabs(x[i])); + +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(add, y[i] = a[i] + b[i]); +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(sub, y[i] = a[i] - b[i]); +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(mul, y[i] = a[i] * b[i]); +DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(div, y[i] = a[i] / b[i]); + } // namespace caffe From be9549ae22e245b45220376890fd84f051cc8653 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Thu, 19 Jun 2014 15:25:58 +0800 Subject: [PATCH 64/75] Implement the macros to define the OpenCL kernels for the math functions --- src/caffe/util/opencl_device.cl | 86 --------------------------------- 1 file changed, 86 deletions(-) delete mode 100644 src/caffe/util/opencl_device.cl diff --git a/src/caffe/util/opencl_device.cl b/src/caffe/util/opencl_device.cl deleted file mode 100644 index 77e4a5e8778..00000000000 --- a/src/caffe/util/opencl_device.cl +++ /dev/null @@ -1,86 +0,0 @@ -// 
Copyright 2014 BVLC and contributors. - -#include "caffe/common.hpp" -#include "caffe/util/opencl_device.hpp" - -namespace caffe { - - -#define DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(name, operation) \ -template \ -__kernel void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ - OPENCL_KERNEL_LOOP(index, n) { \ - operation; \ - } \ -} \ -template <> \ -void caffe_opencl_##name(const int n, const float* x, float* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - n, x, y); \ -} \ -template <> \ -void caffe_opencl_##name(const int n, const double* x, double* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - n, x, y); \ -} \ -template \ -void OpenCLDevice::name(const int N, const Dtype* x, Dtype* y) { \ - caffe_opencl_##name(N, x, y); \ -} \ -template \ -void OpenCLDevice::name(const int N, const float* x, float* y); \ -template \ -void OpenCLDevice::name(const int N, const double* x, double* y); - - -#define DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(name, operation) \ -template \ -__kernel void name##_kernel(__global const int n, __global const Dtype* a, \ - __global const Dtype* b, __global Dtype* y) { \ - OPENCL_KERNEL_LOOP(i, n) { \ - operation; \ - } \ -} \ -template <> \ -void caffe_opencl_##name( \ - __global const int N, __global const float* a, \ - __global const float* b, __global float* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - N, a, b, y); \ -} \ -template <> \ -void caffe_opencl_##name( \ - __global const int N, __global const double* a, \ - __global const double* b, __global double* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - N, a, b, y); \ -} \ -template \ -void OpenCLDevice::name(const int N, const Dtype* a, const Dtype* b, \ - Dtype* y) { \ - caffe_opencl_##name(N, x, y); \ -} \ -template \ -void OpenCLDevice::name(const int N, const float* a, const float* b, \ - float* y); \ -template \ -void OpenCLDevice::name(const int N, const double* a, \ - const double* b, double* y); - - -DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sqr, y[i] = x[i] * x[i]); -DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(exp, y[i] = exp(x[i])); -DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sign, y[i] = sign(x[i])); -DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(sgnbit, y[i] = signbit(x[i])); -DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(fabs, y[i] = fabs(x[i])); - -DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(add, y[i] = a[i] + b[i]); -DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(sub, y[i] = a[i] - b[i]); -DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(mul, y[i] = a[i] * b[i]); -DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(div, y[i] = a[i] / b[i]); - -} // namespace caffe From 1dfc70ff7908b3719409f796797cf787148993c6 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Thu, 19 Jun 2014 19:16:02 +0800 Subject: [PATCH 65/75] Using Shared Context for Multiple OpenCL Devices https://software.intel.com/sites/products/documentation/ioclsdk/2013/OG/Using_Shared_Context_for_Multiple_OpenCL_Devices.htm --- include/caffe/common.hpp | 2 +- src/caffe/util/opencl_device.cpp | 25 ++++++++++++------------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 52043838bf0..e25ad38d022 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -74,7 +74,7 @@ class Caffe { } return *singleton_; } - enum Brew { CPU, GPU, OPENCL_CPU, OPENCL_GPU }; + enum Brew { CPU, GPU, OPENCL_CPU, OPENCL_GPU, OPENCL_ALL }; enum Phase { TRAIN, TEST }; diff 
--git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index 67b8326a5dc..8ca9861e111 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -16,9 +16,9 @@ cl_device_type CaffeOpenCL::get_device_type() { return CL_DEVICE_TYPE_CPU; case Caffe::OPENCL_GPU: return CL_DEVICE_TYPE_GPU; + case Caffe::OPENCL_ALL: default: - LOG(FATAL) << "Unknown Caffe OpenCL mode."; - return CL_DEVICE_TYPE_DEFAULT; + return CL_DEVICE_TYPE_ALL; } } @@ -33,7 +33,7 @@ void CaffeOpenCL::create_context() { malloc(sizeof(cl_platform_id) * platformCount); CL_CHECK(clGetPlatformIDs(1, platforms, NULL)); - cl_uint deviceCount; + cl_uint device_count; cl_device_type device_type = get_device_type(); int num_devices_to_skip = current_device_id_; while (num_devices_to_skip >= 0) { @@ -42,27 +42,26 @@ void CaffeOpenCL::create_context() { CL_CONTEXT_PLATFORM, (cl_context_properties)( platforms[i]), 0}; // get all devices - clGetDeviceIDs(platforms[i], device_type, 0, NULL, &deviceCount); - if (num_devices_to_skip <= deviceCount) { + clGetDeviceIDs(platforms[i], device_type, 0, NULL, &device_count); + if (num_devices_to_skip <= device_count) { current_cl_platform_id_ = platforms[i]; - current_platform_device_count_ = deviceCount; + current_platform_device_count_ = device_count; current_platform_device_id_ = num_devices_to_skip; - current_platform_device_ids_.resize(deviceCount); + current_platform_device_ids_.resize(device_count); CL_CHECK(clGetDeviceIDs(current_cl_platform_id_, device_type, current_platform_device_count_, &(current_platform_device_ids_[0]), NULL)); cl_int error = CL_SUCCESS; // Used to handle error codes - // TODO: clCreateContext or clCreateContextFromType? /* * http://dhruba.name/2012/10/14/opencl-cookbook-how-to-leverage-multiple-devices-in-opencl/ + * https://software.intel.com/sites/products/documentation/ioclsdk/2013/OG/Using_Shared_Context_for_Multiple_OpenCL_Devices.htm */ -// cl_context_ = clCreateContext(properties, deviceCount, devices, NULL, -// NULL, &error); - cl_context_ = clCreateContextFromType(properties, device_type, NULL, - NULL, &error); + cl_context_ = clCreateContext( + properties, device_count, &(current_platform_device_ids_[0]), + NULL, NULL, &error); CL_CHECK(error); } - num_devices_to_skip -= deviceCount; + num_devices_to_skip -= device_count; if (num_devices_to_skip < 0) { break; } From 11d7d967ed42a7f0a7ae39edff74e89d7f30c9d5 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Thu, 19 Jun 2014 22:15:38 +0800 Subject: [PATCH 66/75] Implement OpenCLSyncedMemory keeping the API of SyncedMemory --- include/caffe/opencl_syncedmem.hpp | 85 +++++++++++++++++++ src/caffe/opencl_syncedmem.cpp | 132 +++++++++++++++++++++++++++++ 2 files changed, 217 insertions(+) create mode 100644 include/caffe/opencl_syncedmem.hpp create mode 100644 src/caffe/opencl_syncedmem.cpp diff --git a/include/caffe/opencl_syncedmem.hpp b/include/caffe/opencl_syncedmem.hpp new file mode 100644 index 00000000000..c26b5fc89b4 --- /dev/null +++ b/include/caffe/opencl_syncedmem.hpp @@ -0,0 +1,85 @@ +// Copyright 2014 BVLC and contributors. 
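// A minimal usage sketch (illustrative only; names as declared in this header —
// the class keeps the SyncedMemory API, so callers never touch cl_mem directly):
//   OpenCLSyncedMemory mem(count * sizeof(float));
//   float* host = static_cast<float*>(mem.mutable_cpu_data());  // aligned host alloc
//   // ... fill host ...
//   const void* dev = mem.gpu_data();  // buffer created with CL_MEM_USE_HOST_PTR
// The 64-byte size rounding and 4 KB page alignment below are what let the
// runtime treat such a buffer as zero-copy on shared-memory devices.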
+//#ifdef USE_OPENCL +#ifndef CAFFE_OPENCL_SYNCEDMEM_HPP_ +#define CAFFE_OPENCL_SYNCEDMEM_HPP_ + +#include +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include "caffe/common.hpp" +#include "caffe/syncedmem.hpp" +#include "caffe/util/opencl_device.hpp" + +namespace caffe { + + +/* + * https://software.intel.com/sites/products/documentation/ioclsdk/2013/OG/Mapping_Memory_Objects_(USE_HOST_PTR).htm + * For efficiency reasons such a host-side pointer must be allocated for the + * conditions: + * * The amount of memory you allocate and the size of the corresponding + * * OpenCL* buffer must be multiple of the cache line sizes (64 bytes). + * * Always use 4k alignment (page alignment) when you allocate the host memory + * * for sharing with OpenCL devices. + */ +#define OPENCL_CACHE_LINE_SIZE 64 +#define OPENCL_PAGE_ALIGNMENT 4096 + +inline void opencl_aligned_malloc(void** ptr, size_t* size) { + *size += (*size % OPENCL_CACHE_LINE_SIZE); +#ifdef _MSC_VER + *ptr = _aligned_malloc(*size, OPENCL_PAGE_ALIGNMENT); +#else + if(posix_memalign(ptr, OPENCL_PAGE_ALIGNMENT, *size)) { + *ptr = NULL; + } +#endif +} + +inline void opencl_aligned_free(void* ptr) { +#ifdef _MSC_VER + _aligned_free(ptr); +#else + free(ptr); +#endif +} + +class OpenCLSyncedMemory { + public: + OpenCLSyncedMemory() + : shared_host_ptr_(NULL), mapped_device_ptr_(NULL), size_(0), head_(UNINITIALIZED), + own_cpu_data_(false) {} + explicit OpenCLSyncedMemory(size_t size) + : shared_host_ptr_(NULL), mapped_device_ptr_(NULL), size_(size), head_(UNINITIALIZED), + own_cpu_data_(false) {} + ~OpenCLSyncedMemory(); + const void* cpu_data(); + void set_cpu_data(void* data); + const void* gpu_data(); + void* mutable_cpu_data(); + void* mutable_gpu_data(); + enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; + SyncedHead head() { return head_; } + size_t size() { return size_; } + + private: + void to_cpu(); + void to_gpu(); + void* shared_host_ptr_; + void* mapped_device_ptr_; + cl_mem device_mem_; + size_t size_; + SyncedHead head_; + bool own_cpu_data_; + + DISABLE_COPY_AND_ASSIGN(OpenCLSyncedMemory); +}; // class OpenCLSyncedMemory + +} // namespace caffe + +#endif // CAFFE_OPENCL_SYNCEDMEM_HPP_ +//#endif // USE_OPENCL diff --git a/src/caffe/opencl_syncedmem.cpp b/src/caffe/opencl_syncedmem.cpp new file mode 100644 index 00000000000..a082f17ba10 --- /dev/null +++ b/src/caffe/opencl_syncedmem.cpp @@ -0,0 +1,132 @@ +// Copyright 2014 BVLC and contributors. 
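// Summary of the synchronisation implemented below (it mirrors SyncedMemory's
// state machine): head_ goes UNINITIALIZED -> HEAD_AT_CPU via an aligned host
// allocation, UNINITIALIZED -> HEAD_AT_GPU by wrapping that allocation in a
// CL_MEM_USE_HOST_PTR buffer, and reaches SYNCED through enqueue read/write or
// map/unmap on the shared pointer; no separate device-side allocation is made
// by this code, since the cl_mem aliases the host memory.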
+//#ifdef USE_OPENCL +#include + +#include "caffe/common.hpp" +#include "caffe/opencl_syncedmem.hpp" + +namespace caffe { + +OpenCLSyncedMemory::~OpenCLSyncedMemory() { + if (shared_host_ptr_ && own_cpu_data_) { + opencl_aligned_free(shared_host_ptr_); + shared_host_ptr_ = NULL; + } + + if (mapped_device_ptr_) { + CL_CHECK(clReleaseMemObject(device_mem_)); + free(mapped_device_ptr_); + mapped_device_ptr_ = NULL; + } +} + +inline void OpenCLSyncedMemory::to_cpu() { + switch (head_) { + case UNINITIALIZED: + opencl_aligned_malloc(&shared_host_ptr_, &size_); + memset(shared_host_ptr_, 0, size_); + head_ = HEAD_AT_CPU; + own_cpu_data_ = true; + break; + case HEAD_AT_GPU: + if (shared_host_ptr_ == NULL) { + opencl_aligned_malloc(&shared_host_ptr_, &size_); + own_cpu_data_ = true; + } + CL_CHECK(clEnqueueReadBuffer( + CaffeOpenCL::queue(), device_mem_, CL_TRUE, 0, + size_, shared_host_ptr_, 0, NULL, NULL)); + head_ = SYNCED; + break; + case HEAD_AT_CPU: + case SYNCED: + break; + } +} + +inline void OpenCLSyncedMemory::to_gpu() { + switch (head_) { + case UNINITIALIZED: +/* + * http://streamcomputing.eu/blog/2013-02-03/opencl-basics-flags-for-the-creating-memory-objects/ + */ + opencl_aligned_malloc(&shared_host_ptr_, &size_); + cl_int error; + device_mem_ = clCreateBuffer( + CaffeOpenCL::context(), CL_MEM_USE_HOST_PTR, + size_, shared_host_ptr_, &error); + CL_CHECK(error); + head_ = HEAD_AT_GPU; + break; + case HEAD_AT_CPU: + if (mapped_device_ptr_ == NULL) { + cl_int error; + device_mem_ = clCreateBuffer( + CaffeOpenCL::context(), CL_MEM_USE_HOST_PTR, + size_, shared_host_ptr_, &error); + CL_CHECK(error); + mapped_device_ptr_ = clEnqueueMapBuffer( + CaffeOpenCL::queue(), device_mem_, CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, &error); + CL_CHECK(error); + } + CL_CHECK(clEnqueueWriteBuffer( + CaffeOpenCL::queue(), device_mem_, CL_TRUE, 0, + size_, shared_host_ptr_, 0, NULL, NULL)); + head_ = SYNCED; + break; + case HEAD_AT_GPU: + case SYNCED: + break; + } +} + +const void* OpenCLSyncedMemory::cpu_data() { + to_cpu(); + return (const void*)shared_host_ptr_; +} + +void OpenCLSyncedMemory::set_cpu_data(void* data) { + CHECK(data); + if (own_cpu_data_) { + CaffeFreeHost(shared_host_ptr_); + } + shared_host_ptr_ = data; + head_ = HEAD_AT_CPU; + own_cpu_data_ = false; +} + +const void* OpenCLSyncedMemory::gpu_data() { + to_gpu(); + cl_int error; + mapped_device_ptr_ = clEnqueueMapBuffer( + CaffeOpenCL::queue(), device_mem_, CL_TRUE, + CL_MAP_WRITE, 0, size_, 0, NULL, NULL, &error); + CL_CHECK(error); + CL_CHECK(clEnqueueUnmapMemObject( + CaffeOpenCL::queue(), device_mem_, mapped_device_ptr_, + 0, NULL, NULL)); + return (const void*)mapped_device_ptr_; +} + +void* OpenCLSyncedMemory::mutable_cpu_data() { + to_cpu(); + return shared_host_ptr_; +} + +void* OpenCLSyncedMemory::mutable_gpu_data() { + to_gpu(); + cl_int error; + mapped_device_ptr_ = clEnqueueMapBuffer( + CaffeOpenCL::queue(), device_mem_, CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, &error); + CL_CHECK(error); + CL_CHECK(clEnqueueUnmapMemObject( + CaffeOpenCL::queue(), device_mem_, mapped_device_ptr_, + 0, NULL, NULL)); + return mapped_device_ptr_; +} + + +} // namespace caffe +//#endif // USE_OPENCL From 4694626d48977cbcea6a3ef645ddab9c15e1f476 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Thu, 19 Jun 2014 22:19:12 +0800 Subject: [PATCH 67/75] Replace clEnqueue{Read,Write}Buffer with MapBuffer/UnmapMemObject --- include/caffe/util/opencl_math_functions.hpp | 61 ++++++++++++++++---- 1 file changed, 49 
insertions(+), 12 deletions(-) diff --git a/include/caffe/util/opencl_math_functions.hpp b/include/caffe/util/opencl_math_functions.hpp index ad4c1fe4132..434cb88ef3f 100644 --- a/include/caffe/util/opencl_math_functions.hpp +++ b/include/caffe/util/opencl_math_functions.hpp @@ -80,17 +80,27 @@ void caffe_opencl_##name(const int n, const Dtype *x, Dtype *y) { \ cl_context context = CaffeOpenCL::context(); \ cl_command_queue queue = CaffeOpenCL::queue(); \ cl_int error; \ + const size_t bytes = n * sizeof(Dtype); \ cl_program program = clCreateProgramWithSource( \ context, 1, (const char **) & kernel_source, NULL, &error); \ CL_CHECK(error); \ clBuildProgram(program, 0, NULL, NULL, NULL, NULL); \ cl_kernel kernel = clCreateKernel(program, #name, &error); \ CL_CHECK(error); \ - size_t bytes = n * sizeof(Dtype); \ - cl_mem d_x = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); \ - cl_mem d_y = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); \ - CL_CHECK(clEnqueueWriteBuffer(queue, d_x, CL_TRUE, 0, \ - bytes, x, 0, NULL, NULL)); \ + cl_mem d_x = clCreateBuffer(context, \ + CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, \ + bytes, \ + const_cast(static_cast(x)), \ + &error); \ + cl_mem d_y = clCreateBuffer(context, \ + CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, \ + bytes, static_cast(y), &error); \ + void* mapped_x = clEnqueueMapBuffer( \ + queue, d_x, CL_TRUE, CL_MAP_READ, 0, bytes, 0, NULL, NULL, &error); \ + CL_CHECK(error); \ + CL_CHECK(clEnqueueUnmapMemObject( \ + CaffeOpenCL::queue(), d_x, mapped_x, \ + 0, NULL, NULL)); \ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_x)); \ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_y)); \ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(unsigned int), &n)); \ @@ -100,6 +110,12 @@ void caffe_opencl_##name(const int n, const Dtype *x, Dtype *y) { \ CL_CHECK(clFinish(queue)); \ CL_CHECK(clEnqueueReadBuffer(queue, d_y, CL_TRUE, 0, \ bytes, y, 0, NULL, NULL )); \ + void* mapped_y = clEnqueueMapBuffer( \ + queue, d_y, CL_TRUE, CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &error); \ + CL_CHECK(error); \ + CL_CHECK(clEnqueueUnmapMemObject( \ + CaffeOpenCL::queue(), d_y, mapped_y, \ + 0, NULL, NULL)); \ CL_CHECK(clReleaseMemObject(d_x)); \ CL_CHECK(clReleaseMemObject(d_y)); \ CL_CHECK(clReleaseProgram(program)); \ @@ -108,7 +124,7 @@ void caffe_opencl_##name(const int n, const Dtype *x, Dtype *y) { \ #define DEFINE_AND_INSTANTIATE_OPENCL_UNARY_FUNC(name, operation) \ DEFINE_OPENCL_UNARY_FUNC(float, name, operation) \ - DEFINE_OPENCL_UNARY_FUNC(double, name, operation) \ + DEFINE_OPENCL_UNARY_FUNC(double, name, operation) #define DEFINE_OPENCL_BINARY_FUNC(Dtype, name, operation) \ template <> \ @@ -119,16 +135,33 @@ void caffe_opencl_##name(const int n, const Dtype *a, const Dtype *b, \ cl_context context = CaffeOpenCL::context(); \ cl_command_queue queue = CaffeOpenCL::queue(); \ cl_int error; \ + const size_t bytes = n * sizeof(Dtype); \ cl_program program = clCreateProgramWithSource( \ context, 1, (const char **) & kernel_source, NULL, &error); \ CL_CHECK(error); \ clBuildProgram(program, 0, NULL, NULL, NULL, NULL); \ cl_kernel kernel = clCreateKernel(program, #name, &error); \ CL_CHECK(error); \ - size_t bytes = n * sizeof(Dtype); \ - cl_mem d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); \ - cl_mem d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); \ - cl_mem d_y = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); \ + cl_mem d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, \ + 
const_cast(static_cast(a)),\ + &error); \ + cl_mem d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, \ + const_cast(static_cast(b)), \ + &error); \ + cl_mem d_y = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, \ + static_cast(y), &error); \ + void* mapped_a = clEnqueueMapBuffer( \ + queue, d_a, CL_TRUE, CL_MAP_READ, 0, bytes, 0, NULL, NULL, &error); \ + CL_CHECK(error); \ + CL_CHECK(clEnqueueUnmapMemObject( \ + CaffeOpenCL::queue(), d_a, mapped_a, \ + 0, NULL, NULL)); \ + void* mapped_b = clEnqueueMapBuffer( \ + queue, d_b, CL_TRUE, CL_MAP_READ, 0, bytes, 0, NULL, NULL, &error); \ + CL_CHECK(error); \ + CL_CHECK(clEnqueueUnmapMemObject( \ + CaffeOpenCL::queue(), d_b, mapped_b, \ + 0, NULL, NULL)); \ CL_CHECK(clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, \ bytes, a, 0, NULL, NULL)); \ CL_CHECK(clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, \ @@ -141,8 +174,12 @@ void caffe_opencl_##name(const int n, const Dtype *a, const Dtype *b, \ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, \ &local_size, 0, NULL, NULL)); \ CL_CHECK(clFinish(queue)); \ - CL_CHECK(clEnqueueReadBuffer(queue, d_y, CL_TRUE, 0, \ - bytes, y, 0, NULL, NULL )); \ + void* mapped_y = clEnqueueMapBuffer( \ + queue, d_y, CL_TRUE, CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &error); \ + CL_CHECK(error); \ + CL_CHECK(clEnqueueUnmapMemObject( \ + CaffeOpenCL::queue(), d_y, mapped_y, \ + 0, NULL, NULL)); \ CL_CHECK(clReleaseMemObject(d_a)); \ CL_CHECK(clReleaseMemObject(d_b)); \ CL_CHECK(clReleaseMemObject(d_y)); \ From 5688b2c115cc6a0e1aaa6f4a74bef8da9c3fd899 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Thu, 19 Jun 2014 22:19:34 +0800 Subject: [PATCH 68/75] Add tests for OpenCL math functions --- src/caffe/test/test_opencl_math_functions.cpp | 268 ++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 src/caffe/test/test_opencl_math_functions.cpp diff --git a/src/caffe/test/test_opencl_math_functions.cpp b/src/caffe/test/test_opencl_math_functions.cpp new file mode 100644 index 00000000000..4ff25d82711 --- /dev/null +++ b/src/caffe/test/test_opencl_math_functions.cpp @@ -0,0 +1,268 @@ +// Copyright 2014 BVLC and contributors. + +#include // for uint32_t & uint64_t +#include +#include +#include // for std::fabs +#include // for rand_r + +#include "gtest/gtest.h" +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/util/opencl_math_functions.hpp" + +#include "caffe/test/test_caffe_main.hpp" + +namespace caffe { + +template +class OpenCLMathFunctionsTest : public ::testing::Test { + protected: + OpenCLMathFunctionsTest() + : blob_bottom_(new Blob()), + blob_bottom2_(new Blob()), + blob_top_(new Blob()) { + } + + virtual void SetUp() { + Caffe::set_random_seed(1701); + this->blob_bottom_->Reshape(11, 17, 19, 23); + this->blob_bottom2_->Reshape(11, 17, 19, 23); + this->blob_top_->Reshape(11, 17, 19, 23); + // fill the values + FillerParameter filler_param; + GaussianFiller filler(filler_param); + filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom2_); + filler.Fill(this->blob_top_); + } + + virtual ~OpenCLMathFunctionsTest() { + delete blob_bottom_; + delete blob_bottom2_; + delete blob_top_; + } + + Blob* const blob_bottom_; + Blob* const blob_bottom2_; + Blob* const blob_top_; +}; + +typedef ::testing::Types Dtypes; +TYPED_TEST_CASE(OpenCLMathFunctionsTest, Dtypes); + +TYPED_TEST(OpenCLMathFunctionsTest, TestNothing) { + // The first test case of a test suite takes the longest time + // due to the set up overhead. 
+} + +// TODO: Fix caffe_opencl_hamming_distance and re-enable this test. +TYPED_TEST(OpenCLMathFunctionsTest, DISABLED_TestHammingDistanceOpenCL) { + int n = this->blob_bottom_->count(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + const TypeParam* y = this->blob_top_->cpu_data(); + int reference_distance = this->ReferenceHammingDistance(n, x, y); + x = this->blob_bottom_->opencl_data(); + y = this->blob_top_->opencl_data(); + int computed_distance = caffe_opencl_hamming_distance(n, x, y); + EXPECT_EQ(reference_distance, computed_distance); +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestAsumOpenCL) { + int n = this->blob_bottom_->count(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + TypeParam std_asum = 0; + for (int i = 0; i < n; ++i) { + std_asum += std::fabs(x[i]); + } + TypeParam opencl_asum; + caffe_opencl_asum(n, this->blob_bottom_->opencl_data(), &opencl_asum); + EXPECT_LT((opencl_asum - std_asum) / std_asum, 1e-2); +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSignOpenCL) { + int n = this->blob_bottom_->count(); + caffe_opencl_sign(n, this->blob_bottom_->opencl_data(), + this->blob_bottom_->mutable_opencl_diff()); + const TypeParam* signs = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(signs[i], x[i] > 0 ? 1 : (x[i] < 0 ? -1 : 0)); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSgnbitOpenCL) { + int n = this->blob_bottom_->count(); + caffe_opencl_sgnbit(n, this->blob_bottom_->opencl_data(), + this->blob_bottom_->mutable_opencl_diff()); + const TypeParam* signbits = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(signbits[i], x[i] < 0 ? 1 : 0); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestFabsOpenCL) { + int n = this->blob_bottom_->count(); + caffe_opencl_fabs(n, this->blob_bottom_->opencl_data(), + this->blob_bottom_->mutable_opencl_diff()); + const TypeParam* abs_val = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(abs_val[i], x[i] > 0 ? 
x[i] : -x[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestScaleOpenCL) { + int n = this->blob_bottom_->count(); + TypeParam alpha = this->blob_bottom_->cpu_diff()[caffe_rng_rand() % + this->blob_bottom_->count()]; + caffe_opencl_scale(n, alpha, this->blob_bottom_->opencl_data(), + this->blob_bottom_->mutable_opencl_diff()); + const TypeParam* scaled = this->blob_bottom_->cpu_diff(); + const TypeParam* x = this->blob_bottom_->cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(scaled[i], x[i] * alpha); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestCopyFromCPU) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->cpu_data(); + TypeParam* top_data = this->blob_top_->mutable_cpu_data(); + caffe_opencl_copy_from_cpu(n, bottom_data, top_data); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestCopyOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_copy(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSqrOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_sqr(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestExpOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_exp(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSignOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_sign(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSgnbitOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_sgnbit(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestFabsOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_fabs(n, bottom_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + top_data = 
this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestAddOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + const TypeParam* bottom2_data = this->blob_bottom2_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_add(n, bottom_data, bottom2_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + bottom2_data = this->blob_bottom2_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i] + bottom2_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestSubOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + const TypeParam* bottom2_data = this->blob_bottom2_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_sub(n, bottom_data, bottom2_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + bottom2_data = this->blob_bottom2_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i] - bottom2_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestMulOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + const TypeParam* bottom2_data = this->blob_bottom2_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_mul(n, bottom_data, bottom2_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + bottom2_data = this->blob_bottom2_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i] * bottom2_data[i], top_data[i]); + } +} + +TYPED_TEST(OpenCLMathFunctionsTest, TestDivOpenCL) { + const int n = this->blob_bottom_->count(); + const TypeParam* bottom_data = this->blob_bottom_->opencl_data(); + const TypeParam* bottom2_data = this->blob_bottom2_->opencl_data(); + TypeParam* top_data = this->blob_top_->mutable_opencl_data(); + caffe_opencl_div(n, bottom_data, bottom2_data, top_data); + bottom_data = this->blob_bottom_->cpu_data(); + bottom2_data = this->blob_bottom2_->cpu_data(); + top_data = this->blob_top_->mutable_cpu_data(); + for (int i = 0; i < n; ++i) { + EXPECT_EQ(bottom_data[i] / std::min(bottom2_data[i], 1e-5), top_data[i]); + } +} + +} // namespace caffe From 1fe92d0a3ab98c707108a5e38c4de410fdc494d1 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Thu, 19 Jun 2014 22:39:41 +0800 Subject: [PATCH 69/75] Add common abstract base class for SyncedMemory and OpenCLSyncedMemory --- include/caffe/opencl_syncedmem.hpp | 21 ++++--------- include/caffe/syncedmem.hpp | 45 +++++++++++++++++++--------- src/caffe/opencl_syncedmem.cpp | 48 +++++++++++++++--------------- 3 files changed, 61 insertions(+), 53 deletions(-) diff --git a/include/caffe/opencl_syncedmem.hpp b/include/caffe/opencl_syncedmem.hpp index c26b5fc89b4..ec0e49729f4 100644 --- a/include/caffe/opencl_syncedmem.hpp +++ b/include/caffe/opencl_syncedmem.hpp @@ -48,33 +48,24 @@ inline void opencl_aligned_free(void* ptr) { #endif } -class OpenCLSyncedMemory { +class OpenCLSyncedMemory : public AbstractSyncedMemory { public: - OpenCLSyncedMemory() - : shared_host_ptr_(NULL), mapped_device_ptr_(NULL), size_(0), head_(UNINITIALIZED), - 
own_cpu_data_(false) {} - explicit OpenCLSyncedMemory(size_t size) - : shared_host_ptr_(NULL), mapped_device_ptr_(NULL), size_(size), head_(UNINITIALIZED), - own_cpu_data_(false) {} + OpenCLSyncedMemory() : AbstractSyncedMemory() {} + explicit OpenCLSyncedMemory(size_t size) : AbstractSyncedMemory(size) {} ~OpenCLSyncedMemory(); const void* cpu_data(); void set_cpu_data(void* data); const void* gpu_data(); void* mutable_cpu_data(); void* mutable_gpu_data(); - enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; - SyncedHead head() { return head_; } - size_t size() { return size_; } - - private: + protected: void to_cpu(); void to_gpu(); + + private: void* shared_host_ptr_; void* mapped_device_ptr_; cl_mem device_mem_; - size_t size_; - SyncedHead head_; - bool own_cpu_data_; DISABLE_COPY_AND_ASSIGN(OpenCLSyncedMemory); }; // class OpenCLSyncedMemory diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index bed55c3806e..4c78fa5549e 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -31,33 +31,50 @@ inline void CaffeFreeHost(void* ptr) { free(ptr); } - -class SyncedMemory { +class AbstractSyncedMemory { public: - SyncedMemory() + AbstractSyncedMemory() : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), own_cpu_data_(false) {} - explicit SyncedMemory(size_t size) + explicit AbstractSyncedMemory(size_t size) : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), own_cpu_data_(false) {} - ~SyncedMemory(); + virtual ~AbstractSyncedMemory() {} + enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; + virtual const void* cpu_data() = 0; + virtual void set_cpu_data(void* data) = 0; + virtual const void* gpu_data() = 0; + virtual void* mutable_cpu_data() = 0; + virtual void* mutable_gpu_data() = 0; + virtual SyncedHead head() { return head_; } + virtual size_t size() { return size_; } + + protected: + virtual void to_cpu() = 0; + virtual void to_gpu() = 0; + + protected: + void* cpu_ptr_; + void* gpu_ptr_; + size_t size_; + SyncedHead head_; + bool own_cpu_data_; +}; + +class SyncedMemory : public AbstractSyncedMemory { + public: + SyncedMemory() : AbstractSyncedMemory() {} + explicit SyncedMemory(size_t size) : AbstractSyncedMemory(size) {} + virtual ~SyncedMemory(); const void* cpu_data(); void set_cpu_data(void* data); const void* gpu_data(); void* mutable_cpu_data(); void* mutable_gpu_data(); - enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; - SyncedHead head() { return head_; } - size_t size() { return size_; } - private: + protected: void to_cpu(); void to_gpu(); - void* cpu_ptr_; - void* gpu_ptr_; - size_t size_; - SyncedHead head_; - bool own_cpu_data_; DISABLE_COPY_AND_ASSIGN(SyncedMemory); }; // class SyncedMemory diff --git a/src/caffe/opencl_syncedmem.cpp b/src/caffe/opencl_syncedmem.cpp index a082f17ba10..d8f68a5e026 100644 --- a/src/caffe/opencl_syncedmem.cpp +++ b/src/caffe/opencl_syncedmem.cpp @@ -8,7 +8,7 @@ namespace caffe { OpenCLSyncedMemory::~OpenCLSyncedMemory() { - if (shared_host_ptr_ && own_cpu_data_) { + if (shared_host_ptr_ && this->own_cpu_data_) { opencl_aligned_free(shared_host_ptr_); shared_host_ptr_ = NULL; } @@ -21,22 +21,22 @@ OpenCLSyncedMemory::~OpenCLSyncedMemory() { } inline void OpenCLSyncedMemory::to_cpu() { - switch (head_) { + switch (this->head_) { case UNINITIALIZED: - opencl_aligned_malloc(&shared_host_ptr_, &size_); - memset(shared_host_ptr_, 0, size_); - head_ = HEAD_AT_CPU; - own_cpu_data_ = true; + 
opencl_aligned_malloc(&shared_host_ptr_, &(this->size_)); + memset(shared_host_ptr_, 0, this->size_); + this->head_ = HEAD_AT_CPU; + this->own_cpu_data_ = true; break; case HEAD_AT_GPU: if (shared_host_ptr_ == NULL) { - opencl_aligned_malloc(&shared_host_ptr_, &size_); - own_cpu_data_ = true; + opencl_aligned_malloc(&shared_host_ptr_, &(this->size_)); + this->own_cpu_data_ = true; } CL_CHECK(clEnqueueReadBuffer( CaffeOpenCL::queue(), device_mem_, CL_TRUE, 0, - size_, shared_host_ptr_, 0, NULL, NULL)); - head_ = SYNCED; + this->size_, shared_host_ptr_, 0, NULL, NULL)); + this->head_ = SYNCED; break; case HEAD_AT_CPU: case SYNCED: @@ -45,35 +45,35 @@ inline void OpenCLSyncedMemory::to_cpu() { } inline void OpenCLSyncedMemory::to_gpu() { - switch (head_) { + switch (this->head_) { case UNINITIALIZED: /* * http://streamcomputing.eu/blog/2013-02-03/opencl-basics-flags-for-the-creating-memory-objects/ */ - opencl_aligned_malloc(&shared_host_ptr_, &size_); + opencl_aligned_malloc(&shared_host_ptr_, &(this->size_)); cl_int error; device_mem_ = clCreateBuffer( CaffeOpenCL::context(), CL_MEM_USE_HOST_PTR, - size_, shared_host_ptr_, &error); + this->size_, shared_host_ptr_, &error); CL_CHECK(error); - head_ = HEAD_AT_GPU; + this->head_ = HEAD_AT_GPU; break; case HEAD_AT_CPU: if (mapped_device_ptr_ == NULL) { cl_int error; device_mem_ = clCreateBuffer( CaffeOpenCL::context(), CL_MEM_USE_HOST_PTR, - size_, shared_host_ptr_, &error); + this->size_, shared_host_ptr_, &error); CL_CHECK(error); mapped_device_ptr_ = clEnqueueMapBuffer( CaffeOpenCL::queue(), device_mem_, CL_TRUE, - CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, &error); + CL_MAP_READ | CL_MAP_WRITE, 0, this->size_, 0, NULL, NULL, &error); CL_CHECK(error); } CL_CHECK(clEnqueueWriteBuffer( CaffeOpenCL::queue(), device_mem_, CL_TRUE, 0, - size_, shared_host_ptr_, 0, NULL, NULL)); - head_ = SYNCED; + this->size_, shared_host_ptr_, 0, NULL, NULL)); + this->head_ = SYNCED; break; case HEAD_AT_GPU: case SYNCED: @@ -88,12 +88,12 @@ const void* OpenCLSyncedMemory::cpu_data() { void OpenCLSyncedMemory::set_cpu_data(void* data) { CHECK(data); - if (own_cpu_data_) { + if (this->own_cpu_data_) { CaffeFreeHost(shared_host_ptr_); } shared_host_ptr_ = data; - head_ = HEAD_AT_CPU; - own_cpu_data_ = false; + this->head_ = HEAD_AT_CPU; + this->own_cpu_data_ = false; } const void* OpenCLSyncedMemory::gpu_data() { @@ -101,12 +101,12 @@ const void* OpenCLSyncedMemory::gpu_data() { cl_int error; mapped_device_ptr_ = clEnqueueMapBuffer( CaffeOpenCL::queue(), device_mem_, CL_TRUE, - CL_MAP_WRITE, 0, size_, 0, NULL, NULL, &error); + CL_MAP_WRITE, 0, this->size_, 0, NULL, NULL, &error); CL_CHECK(error); CL_CHECK(clEnqueueUnmapMemObject( CaffeOpenCL::queue(), device_mem_, mapped_device_ptr_, 0, NULL, NULL)); - return (const void*)mapped_device_ptr_; + return (const void*)(mapped_device_ptr_); } void* OpenCLSyncedMemory::mutable_cpu_data() { @@ -119,7 +119,7 @@ void* OpenCLSyncedMemory::mutable_gpu_data() { cl_int error; mapped_device_ptr_ = clEnqueueMapBuffer( CaffeOpenCL::queue(), device_mem_, CL_TRUE, - CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, &error); + CL_MAP_READ | CL_MAP_WRITE, 0, this->size_, 0, NULL, NULL, &error); CL_CHECK(error); CL_CHECK(clEnqueueUnmapMemObject( CaffeOpenCL::queue(), device_mem_, mapped_device_ptr_, From 26917dca85cc8be3e2ab89cc66a542c23893d8c7 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Fri, 20 Jun 2014 09:37:20 +0800 Subject: [PATCH 70/75] Add the factory function to produce synced memory --- 
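A minimal usage sketch of the factory introduced by this patch, assuming the caller owns the returned pointer and selects the Caffe mode before allocating; the wrapper function name and the 64-float size are illustrative only and not part of the patch:

    #include "caffe/common.hpp"
    #include "caffe/syncedmem_factory.hpp"

    void syncedmem_factory_example() {
      // Select the backend first; GetSyncedMemory() switches on Caffe::mode():
      // CPU/GPU modes yield a SyncedMemory, and (when built with USE_OPENCL)
      // the OpenCL modes yield an OpenCLSyncedMemory.
      caffe::Caffe::set_mode(caffe::Caffe::CPU);
      caffe::AbstractSyncedMemory* mem =
          caffe::GetSyncedMemory(64 * sizeof(float));
      // mutable_cpu_data() lazily allocates the host buffer and moves the
      // head to HEAD_AT_CPU, exactly as with the concrete classes.
      float* data = static_cast<float*>(mem->mutable_cpu_data());
      data[0] = 1.0f;
      delete mem;  // safe via the virtual AbstractSyncedMemory destructor
    }
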
include/caffe/syncedmem_factory.hpp | 17 +++++++++++++++++ src/caffe/syncedmem.cpp | 1 - src/caffe/syncedmem_factory.cpp | 24 ++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 include/caffe/syncedmem_factory.hpp create mode 100644 src/caffe/syncedmem_factory.cpp diff --git a/include/caffe/syncedmem_factory.hpp b/include/caffe/syncedmem_factory.hpp new file mode 100644 index 00000000000..08ff2f0da09 --- /dev/null +++ b/include/caffe/syncedmem_factory.hpp @@ -0,0 +1,17 @@ +// Copyright 2014 BVLC and contributors. + +#ifndef CAFFE_SYNCEDMEM_FACTORY_HPP_ +#define CAFFE_SYNCEDMEM_FACTORY_HPP_ + +#include "caffe/common.hpp" +#include "caffe/syncedmem.hpp" +#include "caffe/opencl_syncedmem.hpp" + +namespace caffe { + +// The SyncedMemory factory function +AbstractSyncedMemory* GetSyncedMemory(const size_t size = 0); + +} // namespace caffe + +#endif // CAFFE_SYNCEDMEM_FACTORY_HPP_ diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index fec37d6e9ec..a38ff8d9028 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -93,6 +93,5 @@ void* SyncedMemory::mutable_gpu_data() { return gpu_ptr_; } - } // namespace caffe diff --git a/src/caffe/syncedmem_factory.cpp b/src/caffe/syncedmem_factory.cpp new file mode 100644 index 00000000000..5e86403e929 --- /dev/null +++ b/src/caffe/syncedmem_factory.cpp @@ -0,0 +1,24 @@ +// Copyright 2014 BVLC and contributors. + +#include "caffe/syncedmem_factory.hpp" + +namespace caffe { + +AbstractSyncedMemory* GetSyncedMemory(const size_t size) { + switch (Caffe::mode()) { + case Caffe::CPU: + case Caffe::GPU: + return new SyncedMemory(size); +#ifdef USE_OPENCL + case Caffe::OPENCL_CPU: + case Caffe::OPENCL_GPU: + return new OpenCLSyncedMemory(size); +#endif + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} + +} // namespace caffe + From c31fbaaddae382a1f9b92c4630b8251a1aab62ab Mon Sep 17 00:00:00 2001 From: Kai Li Date: Thu, 19 Jun 2014 22:21:09 +0800 Subject: [PATCH 71/75] Add tests for OpenCL synced memory --- src/caffe/test/test_opencl_syncedmem.cpp | 90 ++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 src/caffe/test/test_opencl_syncedmem.cpp diff --git a/src/caffe/test/test_opencl_syncedmem.cpp b/src/caffe/test/test_opencl_syncedmem.cpp new file mode 100644 index 00000000000..0724298356b --- /dev/null +++ b/src/caffe/test/test_opencl_syncedmem.cpp @@ -0,0 +1,90 @@ +// Copyright 2014 BVLC and contributors. 
+ +#include +#include + +#include "cuda_runtime.h" +#include "gtest/gtest.h" +#include "caffe/common.hpp" +#include "caffe/opencl_syncedmem.hpp" + +#include "caffe/test/test_caffe_main.hpp" + +namespace caffe { + +class OpenCLSyncedMemoryTest : public ::testing::Test {}; + +TEST_F(OpenCLSyncedMemoryTest, TestInitialization) { + OpenCLSyncedMemory mem(10); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::UNINITIALIZED); + EXPECT_EQ(mem.size(), 10); + OpenCLSyncedMemory* p_mem = new OpenCLSyncedMemory(10 * sizeof(float)); + EXPECT_EQ(p_mem->size(), 10 * sizeof(float)); + delete p_mem; +} + +TEST_F(OpenCLSyncedMemoryTest, TestAllocation) { + OpenCLSyncedMemory mem(10); + EXPECT_TRUE(mem.cpu_data()); + EXPECT_TRUE(mem.gpu_data()); + EXPECT_TRUE(mem.mutable_cpu_data()); + EXPECT_TRUE(mem.mutable_gpu_data()); +} + +TEST_F(OpenCLSyncedMemoryTest, TestCPUWrite) { + OpenCLSyncedMemory mem(10); + void* cpu_data = mem.mutable_cpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::HEAD_AT_CPU); + memset(cpu_data, 1, mem.size()); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(cpu_data))[i], 1); + } + const void* gpu_data = mem.gpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::SYNCED); + // check if values are the same + char* recovered_value = new char[10]; + cudaMemcpy(reinterpret_cast(recovered_value), gpu_data, 10, + cudaMemcpyDeviceToHost); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(recovered_value))[i], 1); + } + // do another round + cpu_data = mem.mutable_cpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::HEAD_AT_CPU); + memset(cpu_data, 2, mem.size()); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(cpu_data))[i], 2); + } + gpu_data = mem.gpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::SYNCED); + // check if values are the same + cudaMemcpy(reinterpret_cast(recovered_value), gpu_data, 10, + cudaMemcpyDeviceToHost); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(recovered_value))[i], 2); + } + delete[] recovered_value; +} + +TEST_F(OpenCLSyncedMemoryTest, TestGPUWrite) { + OpenCLSyncedMemory mem(10); + void* gpu_data = mem.mutable_gpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::HEAD_AT_GPU); + CUDA_CHECK(cudaMemset(gpu_data, 1, mem.size())); + const void* cpu_data = mem.cpu_data(); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(cpu_data))[i], 1); + } + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::SYNCED); + + gpu_data = mem.mutable_gpu_data(); + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::HEAD_AT_GPU); + CUDA_CHECK(cudaMemset(gpu_data, 2, mem.size())); + cpu_data = mem.cpu_data(); + for (int i = 0; i < mem.size(); ++i) { + EXPECT_EQ((reinterpret_cast(cpu_data))[i], 2); + } + EXPECT_EQ(mem.head(), OpenCLSyncedMemory::SYNCED); +} + +} // namespace caffe From 2dd1d7d1424707f9538ad8a612dc387c6e33e1b5 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sat, 28 Jun 2014 09:13:10 +0800 Subject: [PATCH 72/75] Opt out OpenCL codes with the macro USE_OPENCL --- include/caffe/opencl_syncedmem.hpp | 5 +++-- include/caffe/util/opencl_device.hpp | 3 +++ include/caffe/util/opencl_math_functions.hpp | 3 +++ src/caffe/opencl_syncedmem.cpp | 5 +++-- src/caffe/util/opencl_device.cpp | 3 +++ src/caffe/util/opencl_math_functions.cpp | 3 +++ 6 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/caffe/opencl_syncedmem.hpp b/include/caffe/opencl_syncedmem.hpp index ec0e49729f4..22d045ce3d4 100644 --- a/include/caffe/opencl_syncedmem.hpp +++ 
b/include/caffe/opencl_syncedmem.hpp @@ -1,5 +1,6 @@ // Copyright 2014 BVLC and contributors. -//#ifdef USE_OPENCL + +#ifdef USE_OPENCL #ifndef CAFFE_OPENCL_SYNCEDMEM_HPP_ #define CAFFE_OPENCL_SYNCEDMEM_HPP_ @@ -73,4 +74,4 @@ class OpenCLSyncedMemory : public AbstractSyncedMemory { } // namespace caffe #endif // CAFFE_OPENCL_SYNCEDMEM_HPP_ -//#endif // USE_OPENCL +#endif // USE_OPENCL diff --git a/include/caffe/util/opencl_device.hpp b/include/caffe/util/opencl_device.hpp index 16d837e5234..d976d580916 100644 --- a/include/caffe/util/opencl_device.hpp +++ b/include/caffe/util/opencl_device.hpp @@ -1,5 +1,6 @@ // Copyright 2014 BVLC and contributors. +#ifdef USE_OPENCL #ifndef CAFFE_UTIL_OPENCL_DEVICE_H_ #define CAFFE_UTIL_OPENCL_DEVICE_H_ @@ -173,3 +174,5 @@ class OpenCLDevice : public Device { } // namespace caffe #endif // CAFFE_UTIL_OPENCL_DEVICE_H_ +#endif // USE_OPENCL + diff --git a/include/caffe/util/opencl_math_functions.hpp b/include/caffe/util/opencl_math_functions.hpp index 434cb88ef3f..1927c58cc80 100644 --- a/include/caffe/util/opencl_math_functions.hpp +++ b/include/caffe/util/opencl_math_functions.hpp @@ -1,5 +1,6 @@ // Copyright 2014 BVLC and contributors. +#ifdef USE_OPENCL #ifndef CAFFE_UTIL_OPENCL_MATH_FUNCTIONS_H_ #define CAFFE_UTIL_OPENCL_MATH_FUNCTIONS_H_ @@ -286,3 +287,5 @@ void caffe_opencl_div(const int n, const Dtype* a, #endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ +#endif // USE_OPENCL + diff --git a/src/caffe/opencl_syncedmem.cpp b/src/caffe/opencl_syncedmem.cpp index d8f68a5e026..f14d4e8177d 100644 --- a/src/caffe/opencl_syncedmem.cpp +++ b/src/caffe/opencl_syncedmem.cpp @@ -1,5 +1,6 @@ // Copyright 2014 BVLC and contributors. -//#ifdef USE_OPENCL + +#ifdef USE_OPENCL #include #include "caffe/common.hpp" @@ -129,4 +130,4 @@ void* OpenCLSyncedMemory::mutable_gpu_data() { } // namespace caffe -//#endif // USE_OPENCL +#endif // USE_OPENCL diff --git a/src/caffe/util/opencl_device.cpp b/src/caffe/util/opencl_device.cpp index 8ca9861e111..a1099e11100 100644 --- a/src/caffe/util/opencl_device.cpp +++ b/src/caffe/util/opencl_device.cpp @@ -1,5 +1,6 @@ // Copyright 2014 BVLC and contributors. +#ifdef USE_OPENCL #include "caffe/common.hpp" #include "caffe/util/opencl_device.hpp" #include "caffe/util/opencl_math_functions.hpp" @@ -378,3 +379,5 @@ const char* clblasGetErrorString(clblasStatus status) { } } // namespace caffe + +#endif // USE_OPENCL diff --git a/src/caffe/util/opencl_math_functions.cpp b/src/caffe/util/opencl_math_functions.cpp index fecd2fca3b5..7f9cbc0f3a3 100644 --- a/src/caffe/util/opencl_math_functions.cpp +++ b/src/caffe/util/opencl_math_functions.cpp @@ -1,5 +1,6 @@ // Copyright 2014 BVLC and contributors. 
+#ifdef USE_OPENCL //#include "caffe/common.hpp" #include "caffe/util/opencl_math_functions.hpp" @@ -229,3 +230,5 @@ DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(mul, y[i] = a[i] * b[i]); DEFINE_AND_INSTANTIATE_OPENCL_BINARY_FUNC(div, y[i] = a[i] / b[i]); } // namespace caffe + +#endif // USE_OPENCL From 16054312d3498c3933f3bc11b18b6da0687112a5 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sat, 28 Jun 2014 09:22:13 +0800 Subject: [PATCH 73/75] Replace all the blob data getters with the device independent version --- include/caffe/data_layers.hpp | 30 ++++----- include/caffe/filler.hpp | 16 ++--- include/caffe/loss_layers.hpp | 66 +++++++------------ include/caffe/neuron_layers.hpp | 2 +- include/caffe/syncedmem.hpp | 3 + include/caffe/vision_layers.hpp | 26 ++++---- src/caffe/layers/accuracy_layer.cpp | 8 +-- src/caffe/layers/argmax_layer.cpp | 6 +- src/caffe/layers/bnll_layer.cpp | 10 +-- src/caffe/layers/concat_layer.cpp | 4 +- src/caffe/layers/conv_layer.cpp | 6 +- src/caffe/layers/dropout_layer.cpp | 14 ++-- src/caffe/layers/dummy_data_layer.cpp | 2 +- src/caffe/layers/euclidean_loss_layer.cpp | 23 +++---- src/caffe/layers/euclidean_loss_layer.cu | 45 ------------- src/caffe/layers/hdf5_output_layer.cpp | 8 --- src/caffe/layers/hinge_loss_layer.cpp | 22 +++---- src/caffe/layers/infogain_loss_layer.cpp | 20 +++--- src/caffe/layers/inner_product_layer.cpp | 8 +-- src/caffe/layers/lrn_layer.cpp | 24 +++---- src/caffe/layers/memory_data_layer.cpp | 2 +- .../multinomial_logistic_loss_layer.cpp | 16 ++--- src/caffe/layers/pooling_layer.cpp | 16 ++--- src/caffe/layers/relu_layer.cpp | 10 +-- .../sigmoid_cross_entropy_loss_layer.cpp | 6 +- src/caffe/layers/sigmoid_layer.cpp | 10 +-- src/caffe/layers/softmax_layer.cpp | 22 +++---- src/caffe/layers/softmax_loss_layer.cpp | 12 ++-- src/caffe/layers/split_layer.cpp | 8 +-- src/caffe/layers/split_layer.cu | 37 ----------- src/caffe/layers/tanh_layer.cpp | 10 +-- src/caffe/layers/threshold_layer.cpp | 4 +- src/caffe/net.cpp | 17 +---- src/caffe/solver.cpp | 65 ++++++------------ 34 files changed, 213 insertions(+), 365 deletions(-) delete mode 100644 src/caffe/layers/euclidean_loss_layer.cu delete mode 100644 src/caffe/layers/split_layer.cu diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 5151c278e11..45112af72bc 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -34,7 +34,7 @@ class HDF5OutputLayer : public Layer { virtual Dtype Forward(const vector*>& bottom, vector*>* top); virtual void Backward(const vector*>& top, - const bool propagate_down, vector*>* bottom) { return; } + const vector& propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_HDF5_OUTPUT; @@ -66,7 +66,7 @@ class HDF5DataLayer : public Layer { virtual Dtype Forward(const vector*>& bottom, vector*>* top); virtual void Backward(const vector*>& top, - const bool propagate_down, vector*>* bottom) { return; } + const vector& propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_HDF5_DATA; @@ -106,7 +106,7 @@ class DataLayer : public Layer { virtual Dtype Forward(const vector*>& bottom, vector*>* top); virtual void Backward(const vector*>& top, - const bool propagate_down, vector*>* bottom) { return; } + const vector& propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_DATA; @@ -157,15 +157,12 
@@ class DummyDataLayer : public Layer { } virtual inline int ExactNumBottomBlobs() const { return 0; } virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual Dtype Forward_cpu(const vector*>& bottom, + virtual Dtype Forward(const vector*>& bottom, vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, + virtual void Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) {} + protected: vector > > fillers_; vector refill_; }; @@ -188,7 +185,7 @@ class ImageDataLayer : public Layer { virtual Dtype Forward(const vector*>& bottom, vector*>* top); virtual void Backward(const vector*>& top, - const bool propagate_down, vector*>* bottom) { return; } + const vector& propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_IMAGE_DATA; @@ -226,6 +223,10 @@ class MemoryDataLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) {} virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_MEMORY_DATA; @@ -242,13 +243,6 @@ class MemoryDataLayer : public Layer { int batch_size() { return batch_size_; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) {} - Dtype* data_; Dtype* labels_; int datum_channels_; @@ -278,7 +272,7 @@ class WindowDataLayer : public Layer { virtual Dtype Forward(const vector*>& bottom, vector*>* top); virtual void Backward(const vector*>& top, - const bool propagate_down, vector*>* bottom) { return; } + const vector& propagate_down, vector*>* bottom) { return; } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_WINDOW_DATA; diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index 242f11a3513..acf34270f6a 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -34,7 +34,7 @@ class ConstantFiller : public Filler { explicit ConstantFiller(const FillerParameter& param) : Filler(param) {} virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); + Dtype* data = blob->mutable_data(); const int count = blob->count(); const Dtype value = this->filler_param_.value(); CHECK(count); @@ -54,7 +54,7 @@ class UniformFiller : public Filler { virtual void Fill(Blob* blob) { CHECK(blob->count()); caffe_rng_uniform(blob->count(), Dtype(this->filler_param_.min()), - Dtype(this->filler_param_.max()), blob->mutable_cpu_data()); + Dtype(this->filler_param_.max()), blob->mutable_data()); CHECK_EQ(this->filler_param_.sparse(), -1) << "Sparsity not supported by this Filler."; } @@ -66,10 +66,10 @@ class GaussianFiller : public Filler { explicit GaussianFiller(const FillerParameter& param) : Filler(param) {} virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); + Dtype* data = blob->mutable_data(); CHECK(blob->count()); caffe_rng_gaussian(blob->count(), Dtype(this->filler_param_.mean()), - Dtype(this->filler_param_.std()), blob->mutable_cpu_data()); + Dtype(this->filler_param_.std()), blob->mutable_data()); int 
sparse = this->filler_param_.sparse(); CHECK_GE(sparse, -1); if (sparse >= 0) { @@ -82,7 +82,7 @@ class GaussianFiller : public Filler { int num_inputs = blob->height(); Dtype non_zero_probability = Dtype(sparse) / Dtype(num_inputs); rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int))); - int* mask = reinterpret_cast(rand_vec_->mutable_cpu_data()); + int* mask = reinterpret_cast(rand_vec_->mutable_data()); caffe_rng_bernoulli(blob->count(), non_zero_probability, mask); for (int i = 0; i < blob->count(); ++i) { data[i] *= mask[i]; @@ -100,9 +100,9 @@ class PositiveUnitballFiller : public Filler { explicit PositiveUnitballFiller(const FillerParameter& param) : Filler(param) {} virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); + Dtype* data = blob->mutable_data(); DCHECK(blob->count()); - caffe_rng_uniform(blob->count(), 0, 1, blob->mutable_cpu_data()); + caffe_rng_uniform(blob->count(), 0, 1, blob->mutable_data()); // We expect the filler to not be called very frequently, so we will // just use a simple implementation int dim = blob->count() / blob->num(); @@ -139,7 +139,7 @@ class XavierFiller : public Filler { int fan_in = blob->count() / blob->num(); Dtype scale = sqrt(Dtype(3) / fan_in); caffe_rng_uniform(blob->count(), -scale, scale, - blob->mutable_cpu_data()); + blob->mutable_data()); CHECK_EQ(this->filler_param_.sparse(), -1) << "Sparsity not supported by this Filler."; } diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 2d9cb39b3ab..ab8a9caa5f0 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -63,6 +63,10 @@ class SoftmaxWithLossLayer : public Layer { : Layer(param), softmax_layer_(new SoftmaxLayer(param)) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_SOFTMAX_LOSS; @@ -75,15 +79,6 @@ class SoftmaxWithLossLayer : public Layer { } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - shared_ptr > softmax_layer_; // prob stores the output probability of the layer. 
Blob prob_; @@ -106,7 +101,7 @@ class SigmoidCrossEntropyLossLayer : public LossLayer { virtual Dtype Forward(const vector*>& bottom, vector*>* top); virtual void Backward(const vector*>& top, - const bool propagate_down, vector*>* bottom); + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_SIGMOID_CROSS_ENTROPY_LOSS; @@ -134,6 +129,10 @@ class EuclideanLossLayer : public LossLayer { : LossLayer(param), diff_() {} virtual void FurtherSetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_EUCLIDEAN_LOSS; @@ -145,15 +144,6 @@ class EuclideanLossLayer : public LossLayer { } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual Dtype Forward_gpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - Blob diff_; }; @@ -166,17 +156,16 @@ class InfogainLossLayer : public LossLayer { : LossLayer(param), infogain_() {} virtual void FurtherSetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_INFOGAIN_LOSS; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); - Blob infogain_; }; @@ -191,11 +180,9 @@ class HingeLossLayer : public LossLayer { virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_HINGE_LOSS; } - - protected: - virtual Dtype Forward_cpu(const vector*>& bottom, + virtual Dtype Forward(const vector*>& bottom, vector*>* top); - virtual void Backward_cpu(const vector*>& top, + virtual void Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom); }; @@ -208,16 +195,14 @@ class MultinomialLogisticLossLayer : public LossLayer { : LossLayer(param) {} virtual void FurtherSetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS; } - - protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom); }; /* AccuracyLayer @@ -231,6 +216,12 @@ class AccuracyLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) { + NOT_IMPLEMENTED; + } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_ACCURACY; @@ -240,13 +231,6 @@ class AccuracyLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual 
Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - NOT_IMPLEMENTED; - } - int top_k_; }; diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index 86eb15660f8..16c8ae64835 100644 --- a/include/caffe/neuron_layers.hpp +++ b/include/caffe/neuron_layers.hpp @@ -123,7 +123,7 @@ class PowerLayer : public NeuronLayer { virtual Dtype Forward(const vector*>& bottom, vector*>* top); virtual void Backward(const vector*>& top, - const bool propagate_down, vector*>* bottom); + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_POWER; diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 4c78fa5549e..ebe29542a17 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -49,6 +49,9 @@ class AbstractSyncedMemory { virtual SyncedHead head() { return head_; } virtual size_t size() { return size_; } + const void* const_data() const { return NULL; } + void* mutable_data() { return NULL;} + protected: virtual void to_cpu() = 0; virtual void to_gpu() = 0; diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 1e117c714d2..abe6f0c6483 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -33,6 +33,12 @@ class ArgMaxLayer : public Layer { : Layer(param) {} virtual void SetUp(const vector*>& bottom, vector*>* top); + virtual Dtype Forward(const vector*>& bottom, + vector*>* top); + virtual void Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) { + NOT_IMPLEMENTED; + } virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_ARGMAX; @@ -41,12 +47,6 @@ class ArgMaxLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - virtual Dtype Forward_cpu(const vector*>& bottom, - vector*>* top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - NOT_IMPLEMENTED; - } bool out_max_val_; }; @@ -64,7 +64,7 @@ class ConcatLayer : public Layer { virtual Dtype Forward(const vector*>& bottom, vector*>* top); virtual void Backward(const vector*>& top, - const bool propagate_down, + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { @@ -95,7 +95,7 @@ class ConvolutionLayer : public Layer { virtual Dtype Forward(const vector*>& bottom, vector*>* top); virtual void Backward(const vector*>& top, - const bool propagate_down, vector*>* bottom); + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_CONVOLUTION; @@ -134,7 +134,7 @@ class EltwiseLayer : public Layer { virtual Dtype Forward(const vector*>& bottom, vector*>* top); virtual void Backward(const vector*>& top, - const bool propagate_down, vector*>* bottom); + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_ELTWISE; @@ -159,7 +159,7 @@ class FlattenLayer : public Layer { virtual Dtype Forward(const vector*>& bottom, vector*>* top); virtual void Backward(const vector*>& top, - const bool propagate_down, vector*>* bottom); + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_FLATTEN; @@ -183,7 +183,7 @@ class 
Im2colLayer : public Layer { virtual Dtype Forward(const vector*>& bottom, vector*>* top); virtual void Backward(const vector*>& top, - const bool propagate_down, vector*>* bottom); + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_IM2COL; @@ -212,7 +212,7 @@ class InnerProductLayer : public Layer { virtual Dtype Forward(const vector*>& bottom, vector*>* top); virtual void Backward(const vector*>& top, - const bool propagate_down, vector*>* bottom); + const vector& propagate_down, vector*>* bottom); virtual inline LayerParameter_LayerType type() const { return LayerParameter_LayerType_INNER_PRODUCT; @@ -388,7 +388,7 @@ class SplitLayer : public Layer { virtual Dtype Forward(const vector*>& bottom, vector*>* top); virtual void Backward(const vector*>& top, - const bool propagate_down, vector*>* bottom); + const vector& propagate_down, vector*>* bottom); protected: int count_; diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp index 409965519ca..4c68c4a8fd4 100644 --- a/src/caffe/layers/accuracy_layer.cpp +++ b/src/caffe/layers/accuracy_layer.cpp @@ -30,11 +30,11 @@ void AccuracyLayer::SetUp( } template -Dtype AccuracyLayer::Forward_cpu(const vector*>& bottom, +Dtype AccuracyLayer::Forward(const vector*>& bottom, vector*>* top) { Dtype accuracy = 0; - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + const Dtype* bottom_label = bottom[1]->const_data(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); vector maxval(top_k_+1); @@ -62,7 +62,7 @@ Dtype AccuracyLayer::Forward_cpu(const vector*>& bottom, } // LOG(INFO) << "Accuracy: " << accuracy; - (*top)[0]->mutable_cpu_data()[0] = accuracy / num; + (*top)[0]->mutable_data()[0] = accuracy / num; // Accuracy layer should not be used as a loss function. return Dtype(0); diff --git a/src/caffe/layers/argmax_layer.cpp b/src/caffe/layers/argmax_layer.cpp index cc31c0f52d8..1c0c402d5ea 100644 --- a/src/caffe/layers/argmax_layer.cpp +++ b/src/caffe/layers/argmax_layer.cpp @@ -24,10 +24,10 @@ void ArgMaxLayer::SetUp(const vector*>& bottom, } template -Dtype ArgMaxLayer::Forward_cpu(const vector*>& bottom, +Dtype ArgMaxLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); for (int i = 0; i < num; ++i) { diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index 95e6bd8748c..3ff9e3f99df 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -15,8 +15,8 @@ const float kBNLL_THRESHOLD = 50.; template Dtype BNLLLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { top_data[i] = bottom_data[i] > 0 ? 
@@ -31,9 +31,9 @@ void BNLLLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); const int count = (*bottom)[0]->count(); Dtype expval; for (int i = 0; i < count; ++i) { diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index af1bc29748e..a2c9aed8b94 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -73,8 +73,8 @@ Dtype ConcatLayer::Forward(const vector*>& bottom, } template -void ConcatLayer::Backward_xpu(const vector*>& top, - const bool propagate_down, vector*>* bottom) { +void ConcatLayer::Backward(const vector*>& top, + const vector& propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->const_diff(); if (concat_dim_ == 0) { int offset_num = 0; diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 02871c29d05..8fba11fa2fd 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -68,7 +68,7 @@ void ConvolutionLayer::SetUp(const vector*>& bottom, if (bias_term_) { bias_multiplier_.reset(new SyncedMemory(N_ * sizeof(Dtype))); Dtype* bias_multiplier_data = - reinterpret_cast(bias_multiplier_->mutable_cpu_data()); + reinterpret_cast(bias_multiplier_->mutable_data()); for (int i = 0; i < N_; ++i) { bias_multiplier_data[i] = 1.; } @@ -101,7 +101,7 @@ Dtype ConvolutionLayer::Forward(const vector*>& bottom, if (bias_term_) { this->device_->gemm(CblasNoTrans, CblasNoTrans, num_output_, N_, 1, (Dtype)1., this->blobs_[1]->const_data(), - reinterpret_cast(bias_multiplier_->cpu_data()), + reinterpret_cast(bias_multiplier_->const_data()), (Dtype)1., top_data + (*top)[0]->offset(n)); } } @@ -127,7 +127,7 @@ void ConvolutionLayer::Backward(const vector*>& top, for (int n = 0; n < num_; ++n) { this->device_->gemv(CblasNoTrans, num_output_, N_, 1., top_diff + top[0]->offset(n), - reinterpret_cast(bias_multiplier_->cpu_data()), 1., + reinterpret_cast(bias_multiplier_->const_data()), 1., bias_diff); } } diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index e9a1a524d63..fa11c17ac5d 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -29,9 +29,9 @@ void DropoutLayer::SetUp(const vector*>& bottom, template Dtype DropoutLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - unsigned int* mask = rand_vec_->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); + int* mask = reinterpret_cast(rand_vec_->mutable_data()); const int count = bottom[0]->count(); if (Caffe::phase() == Caffe::TRAIN) { // Create random numbers @@ -40,7 +40,7 @@ Dtype DropoutLayer::Forward_cpu(const vector*>& bottom, top_data[i] = bottom_data[i] * mask[i] * scale_; } } else { - caffe_copy(bottom[0]->count(), bottom_data, top_data); + this->device_->copy(bottom[0]->count(), bottom_data, top_data); } return Dtype(0); } @@ -51,9 +51,9 @@ void DropoutLayer::Backward_cpu(const vector*>& top, vector*>* bottom) { CHECK(Caffe::phase() == Caffe::TRAIN); if (propagate_down[0]) { - 
const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - const unsigned int* mask = rand_vec_->cpu_data(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + const unsigned int* mask = rand_vec_->const_data(); const int count = (*bottom)[0]->count(); for (int i = 0; i < count; ++i) { bottom_diff[i] = top_diff[i] * mask[i] * scale_; diff --git a/src/caffe/layers/dummy_data_layer.cpp b/src/caffe/layers/dummy_data_layer.cpp index 58044f4c952..a4b0b17a2e4 100644 --- a/src/caffe/layers/dummy_data_layer.cpp +++ b/src/caffe/layers/dummy_data_layer.cpp @@ -84,7 +84,7 @@ void DummyDataLayer::SetUp(const vector*>& bottom, } template -Dtype DummyDataLayer::Forward_cpu(const vector*>& bottom, +Dtype DummyDataLayer::Forward(const vector*>& bottom, vector*>* top) { for (int i = 0; i < top->size(); ++i) { const int filler_id = (fillers_.size() > 1) ? i : 0; diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index 2478a514cac..2c1dbe5c101 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -20,34 +20,35 @@ void EuclideanLossLayer::FurtherSetUp( } template -Dtype EuclideanLossLayer::Forward_cpu(const vector*>& bottom, +Dtype EuclideanLossLayer::Forward(const vector*>& bottom, vector*>* top) { int count = bottom[0]->count(); - caffe_sub( + this->device_->sub( count, - bottom[0]->cpu_data(), - bottom[1]->cpu_data(), - diff_.mutable_cpu_data()); - Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data()); + bottom[0]->const_data(), + bottom[1]->const_data(), + diff_.mutable_data()); + Dtype dot; + this->device_->dot(count, diff_.const_data(), diff_.const_data(), &dot); Dtype loss = dot / bottom[0]->num() / Dtype(2); if (top->size() == 1) { - (*top)[0]->mutable_cpu_data()[0] = loss; + (*top)[0]->mutable_data()[0] = loss; } return loss; } template -void EuclideanLossLayer::Backward_cpu(const vector*>& top, +void EuclideanLossLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { for (int i = 0; i < 2; ++i) { if (propagate_down[i]) { const Dtype sign = (i == 0) ? 1 : -1; - caffe_cpu_axpby( + this->device_->axpby( (*bottom)[i]->count(), // count sign / (*bottom)[i]->num(), // alpha - diff_.cpu_data(), // a + diff_.const_data(), // a Dtype(0), // beta - (*bottom)[i]->mutable_cpu_diff()); // b + (*bottom)[i]->mutable_diff()); // b } } } diff --git a/src/caffe/layers/euclidean_loss_layer.cu b/src/caffe/layers/euclidean_loss_layer.cu deleted file mode 100644 index b070ea96ff8..00000000000 --- a/src/caffe/layers/euclidean_loss_layer.cu +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/util/io.hpp" - -namespace caffe { - -template -Dtype EuclideanLossLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), - bottom[1]->gpu_data(), - diff_.mutable_gpu_data()); - Dtype dot; - caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); - Dtype loss = dot / bottom[0]->num() / Dtype(2); - return loss; -} - -template -void EuclideanLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const Dtype sign = (i == 0) ? 
1 : -1; - caffe_gpu_axpby( - (*bottom)[i]->count(), // count - sign / (*bottom)[i]->num(), // alpha - diff_.gpu_data(), // a - Dtype(0), // beta - (*bottom)[i]->mutable_gpu_diff()); // b - } - } -} - -INSTANTIATE_CLASS(EuclideanLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index 06b566ffe7d..0601cd9f473 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -41,14 +41,6 @@ void HDF5OutputLayer::SaveBlobs() { LOG(INFO) << "Successfully saved " << data_blob_.num() << " rows"; } -template -void HDF5OutputLayer::SetUp(const vector*>& bottom, - vector*>* top) { - // TODO: no limit on the number of blobs - CHECK_EQ(bottom.size(), 2) << "HDF5OutputLayer takes two blobs as input."; - CHECK_EQ(top->size(), 0) << "HDF5OutputLayer takes no output blobs."; -} - template Dtype HDF5OutputLayer::Forward(const vector*>& bottom, vector*>* top) { diff --git a/src/caffe/layers/hinge_loss_layer.cpp b/src/caffe/layers/hinge_loss_layer.cpp index 8097761d22b..528b111bba4 100644 --- a/src/caffe/layers/hinge_loss_layer.cpp +++ b/src/caffe/layers/hinge_loss_layer.cpp @@ -15,16 +15,16 @@ using std::max; namespace caffe { template -Dtype HingeLossLayer::Forward_cpu(const vector*>& bottom, +Dtype HingeLossLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - const Dtype* label = bottom[1]->cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* bottom_diff = bottom[0]->mutable_diff(); + const Dtype* label = bottom[1]->const_data(); int num = bottom[0]->num(); int count = bottom[0]->count(); int dim = count / num; - caffe_copy(count, bottom_data, bottom_diff); + this->device_->copy(count, bottom_data, bottom_diff); for (int i = 0; i < num; ++i) { bottom_diff[i * dim + static_cast(label[i])] *= -1; } @@ -44,15 +44,15 @@ Dtype HingeLossLayer::Forward_cpu(const vector*>& bottom, } template -void HingeLossLayer::Backward_cpu(const vector*>& top, +void HingeLossLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type_name() << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - const Dtype* label = (*bottom)[1]->cpu_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + const Dtype* label = (*bottom)[1]->const_data(); int num = (*bottom)[0]->num(); int count = (*bottom)[0]->count(); int dim = count / num; @@ -63,11 +63,11 @@ void HingeLossLayer::Backward_cpu(const vector*>& top, switch (this->layer_param_.hinge_loss_param().norm()) { case HingeLossParameter_Norm_L1: - caffe_cpu_sign(count, bottom_diff, bottom_diff); - caffe_scal(count, Dtype(1. / num), bottom_diff); + this->device_->sign(count, bottom_diff, bottom_diff); + this->device_->scal(count, Dtype(1. / num), bottom_diff); break; case HingeLossParameter_Norm_L2: - caffe_scal(count, Dtype(2. / num), bottom_diff); + this->device_->scal(count, Dtype(2. 
/ num), bottom_diff); break; default: LOG(FATAL) << "Unknown Norm"; diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp index a72874e4bb4..8a48d18f498 100644 --- a/src/caffe/layers/infogain_loss_layer.cpp +++ b/src/caffe/layers/infogain_loss_layer.cpp @@ -32,11 +32,11 @@ void InfogainLossLayer::FurtherSetUp( template -Dtype InfogainLossLayer::Forward_cpu(const vector*>& bottom, +Dtype InfogainLossLayer::Forward(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); - const Dtype* infogain_mat = infogain_.cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + const Dtype* bottom_label = bottom[1]->const_data(); + const Dtype* infogain_mat = infogain_.const_data(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); CHECK_EQ(infogain_.height(), dim); @@ -49,13 +49,13 @@ Dtype InfogainLossLayer::Forward_cpu(const vector*>& bottom, } } if (top->size() == 1) { - (*top)[0]->mutable_cpu_data()[0] = loss / num; + (*top)[0]->mutable_data()[0] = loss / num; } return loss / num; } template -void InfogainLossLayer::Backward_cpu(const vector*>& top, +void InfogainLossLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[1]) { @@ -63,10 +63,10 @@ void InfogainLossLayer::Backward_cpu(const vector*>& top, << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* bottom_label = (*bottom)[1]->cpu_data(); - const Dtype* infogain_mat = infogain_.cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + const Dtype* bottom_label = (*bottom)[1]->const_data(); + const Dtype* infogain_mat = infogain_.const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); int num = (*bottom)[0]->num(); int dim = (*bottom)[0]->count() / (*bottom)[0]->num(); CHECK_EQ(infogain_.height(), dim); diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 28dc4468031..31f3f490f7f 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -49,7 +49,7 @@ void InnerProductLayer::SetUp(const vector*>& bottom, if (bias_term_) { bias_multiplier_.reset(new SyncedMemory(M_ * sizeof(Dtype))); Dtype* bias_multiplier_data = - reinterpret_cast(bias_multiplier_->mutable_cpu_data()); + reinterpret_cast(bias_multiplier_->mutable_data()); for (int i = 0; i < M_; ++i) { bias_multiplier_data[i] = 1.; } @@ -66,7 +66,7 @@ Dtype InnerProductLayer::Forward(const vector*>& bottom, bottom_data, weight, (Dtype)0., top_data); if (bias_term_) { this->device_->gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., - reinterpret_cast(bias_multiplier_->cpu_data()), + reinterpret_cast(bias_multiplier_->const_data()), this->blobs_[1]->const_data(), (Dtype)1., top_data); } return Dtype(0); @@ -84,8 +84,8 @@ void InnerProductLayer::Backward(const vector*>& top, if (bias_term_) { // Gradient with respect to bias this->device_->gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - reinterpret_cast(bias_multiplier_->cpu_data()), (Dtype)0., - this->blobs_[1]->mutable_cpu_diff()); + reinterpret_cast(bias_multiplier_->const_data()), (Dtype)0., + this->blobs_[1]->mutable_diff()); } if (propagate_down[0]) { // Gradient with respect to bottom data diff --git a/src/caffe/layers/lrn_layer.cpp 
b/src/caffe/layers/lrn_layer.cpp index a86c1d4c59d..7472e60860f 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -114,15 +114,15 @@ Dtype LRNLayer::Forward_cpu(const vector*>& bottom, template Dtype LRNLayer::CrossChannelForward_cpu( const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - Dtype* scale_data = scale_.mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); + Dtype* scale_data = scale_.mutable_data(); // start with the constant value for (int i = 0; i < scale_.count(); ++i) { scale_data[i] = 1.; } Blob padded_square(1, channels_ + size_ - 1, height_, width_); - Dtype* padded_square_data = padded_square.mutable_cpu_data(); + Dtype* padded_square_data = padded_square.mutable_data(); memset(padded_square_data, 0, sizeof(Dtype) * padded_square.count()); Dtype alpha_over_size = alpha_ / size_; // go through the images @@ -190,17 +190,17 @@ template void LRNLayer::CrossChannelBackward_cpu( const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* scale_data = scale_.cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); + const Dtype* top_data = top[0]->const_data(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + const Dtype* scale_data = scale_.const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); Blob padded_ratio(1, channels_ + size_ - 1, height_, width_); Blob accum_ratio(1, 1, height_, width_); - Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data(); - Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data(); + Dtype* padded_ratio_data = padded_ratio.mutable_data(); + Dtype* accum_ratio_data = accum_ratio.mutable_data(); // We hack a little bit by using the diff() to store an additional result - Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff(); + Dtype* accum_ratio_times_bottom = accum_ratio.mutable_diff(); memset(padded_ratio_data, 0, sizeof(Dtype) * padded_ratio.count()); Dtype cache_ratio_value = 2. 
* alpha_ * beta_ / size_; diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp index 15eedb317e3..cb8f1d57626 100644 --- a/src/caffe/layers/memory_data_layer.cpp +++ b/src/caffe/layers/memory_data_layer.cpp @@ -36,7 +36,7 @@ void MemoryDataLayer::Reset(Dtype* data, Dtype* labels, int n) { } template -Dtype MemoryDataLayer::Forward_cpu(const vector*>& bottom, +Dtype MemoryDataLayer::Forward(const vector*>& bottom, vector*>* top) { CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset"; (*top)[0]->set_cpu_data(data_ + pos_ * datum_size_); diff --git a/src/caffe/layers/multinomial_logistic_loss_layer.cpp b/src/caffe/layers/multinomial_logistic_loss_layer.cpp index 013d4034240..763ba9f9d80 100644 --- a/src/caffe/layers/multinomial_logistic_loss_layer.cpp +++ b/src/caffe/layers/multinomial_logistic_loss_layer.cpp @@ -23,10 +23,10 @@ void MultinomialLogisticLossLayer::FurtherSetUp( } template -Dtype MultinomialLogisticLossLayer::Forward_cpu( +Dtype MultinomialLogisticLossLayer::Forward( const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - const Dtype* bottom_label = bottom[1]->cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + const Dtype* bottom_label = bottom[1]->const_data(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); Dtype loss = 0; @@ -36,13 +36,13 @@ Dtype MultinomialLogisticLossLayer::Forward_cpu( loss -= log(prob); } if (top->size() == 1){ - (*top)[0]->mutable_cpu_data()[0] = loss / num; + (*top)[0]->mutable_data()[0] = loss / num; } return loss / num; } template -void MultinomialLogisticLossLayer::Backward_cpu( +void MultinomialLogisticLossLayer::Backward( const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[1]) { @@ -50,9 +50,9 @@ void MultinomialLogisticLossLayer::Backward_cpu( << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* bottom_label = (*bottom)[1]->cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + const Dtype* bottom_label = (*bottom)[1]->const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); int num = (*bottom)[0]->num(); int dim = (*bottom)[0]->count() / (*bottom)[0]->num(); memset(bottom_diff, 0, sizeof(Dtype) * (*bottom)[0]->count()); diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index bc002078814..303a5ef4e0a 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -82,8 +82,8 @@ void PoolingLayer::SetUp(const vector*>& bottom, template Dtype PoolingLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int top_count = (*top)[0]->count(); // We'll output the mask to top[1] if it's of size >1. 
const bool use_top_mask = top->size() > 1; @@ -95,10 +95,10 @@ Dtype PoolingLayer::Forward_cpu(const vector*>& bottom, case PoolingParameter_PoolMethod_MAX: // Initialize if (use_top_mask) { - top_mask = (*top)[1]->mutable_cpu_data(); + top_mask = (*top)[1]->mutable_data(); caffe_set(top_count, Dtype(-1), top_mask); } else { - mask = max_idx_->mutable_cpu_data(); + mask = max_idx_->mutable_data(); caffe_set(top_count, -1, mask); } caffe_set(top_count, Dtype(-FLT_MAX), top_data); @@ -188,8 +188,8 @@ void PoolingLayer::Backward_cpu(const vector*>& top, if (!propagate_down[0]) { return; } - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); // Different pooling methods. We explicitly do the switch outside the for // loop to save time, although this results in more codes. caffe_set((*bottom)[0]->count(), Dtype(0), bottom_diff); @@ -201,9 +201,9 @@ void PoolingLayer::Backward_cpu(const vector*>& top, case PoolingParameter_PoolMethod_MAX: // The main loop if (use_top_mask) { - top_mask = top[1]->cpu_data(); + top_mask = top[1]->const_data(); } else { - mask = max_idx_->cpu_data(); + mask = max_idx_->const_data(); } for (int n = 0; n < top[0]->num(); ++n) { for (int c = 0; c < channels_; ++c) { diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index d7a8509b247..6efda981228 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -13,8 +13,8 @@ namespace caffe { template Dtype ReLULayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { top_data[i] = max(bottom_data[i], Dtype(0)); @@ -27,9 +27,9 @@ void ReLULayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* bottom_data = (*bottom)[0]->const_data(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); const int count = (*bottom)[0]->count(); for (int i = 0; i < count; ++i) { bottom_diff[i] = top_diff[i] * (bottom_data[i] > 0); diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index 627598bde12..367093964b2 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -42,7 +42,7 @@ Dtype SigmoidCrossEntropyLossLayer::Forward( log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); } if (top->size() == 1) { - (*top)[0]->mutable_cpu_data()[0] = loss / num; + (*top)[0]->mutable_data()[0] = loss / num; } return loss / num; } @@ -59,9 +59,9 @@ void SigmoidCrossEntropyLossLayer::Backward( // First, compute the diff const int count = (*bottom)[0]->count(); const int num = (*bottom)[0]->num(); - const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data(); + const Dtype* sigmoid_output_data = sigmoid_output_->const_data(); const Dtype* target = (*bottom)[1]->const_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + Dtype* 
bottom_diff = (*bottom)[0]->mutable_diff(); this->device_->sub(count, sigmoid_output_data, target, bottom_diff); // Scale down gradient this->device_->scal(count, Dtype(1) / num, bottom_diff); diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index 50139d863dd..8d8afd3e356 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -17,8 +17,8 @@ inline Dtype sigmoid(Dtype x) { template Dtype SigmoidLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { top_data[i] = sigmoid(bottom_data[i]); @@ -31,9 +31,9 @@ void SigmoidLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* top_data = top[0]->const_data(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); const int count = (*bottom)[0]->count(); for (int i = 0; i < count; ++i) { const Dtype sigmoid_x = top_data[i]; diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 57847d005f6..14548c94c4d 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -19,7 +19,7 @@ void SoftmaxLayer::SetUp(const vector*>& bottom, bottom[0]->height(), bottom[0]->width()); sum_multiplier_.Reshape(1, bottom[0]->channels(), bottom[0]->height(), bottom[0]->width()); - Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); + Dtype* multiplier_data = sum_multiplier_.mutable_data(); for (int i = 0; i < sum_multiplier_.count(); ++i) { multiplier_data[i] = 1.; } @@ -29,9 +29,9 @@ void SoftmaxLayer::SetUp(const vector*>& bottom, template Dtype SoftmaxLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - Dtype* scale_data = scale_.mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); + Dtype* scale_data = scale_.mutable_data(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); memcpy(top_data, bottom_data, sizeof(Dtype) * bottom[0]->count()); @@ -45,12 +45,12 @@ Dtype SoftmaxLayer::Forward_cpu(const vector*>& bottom, } // subtraction caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - scale_data, sum_multiplier_.cpu_data(), 1., top_data); + scale_data, sum_multiplier_.const_data(), 1., top_data); // Perform exponentiation caffe_exp(num * dim, top_data, top_data); // sum after exp caffe_cpu_gemv(CblasNoTrans, num, dim, 1., top_data, - sum_multiplier_.cpu_data(), 0., scale_data); + sum_multiplier_.const_data(), 0., scale_data); // Do division for (int i = 0; i < num; ++i) { caffe_scal(dim, Dtype(1.) 
/ scale_data[i], top_data + i * dim); @@ -62,10 +62,10 @@ template void SoftmaxLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* top_data = top[0]->cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - Dtype* scale_data = scale_.mutable_cpu_data(); + const Dtype* top_diff = top[0]->const_diff(); + const Dtype* top_data = top[0]->const_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + Dtype* scale_data = scale_.mutable_data(); int num = top[0]->num(); int dim = top[0]->count() / top[0]->num(); memcpy(bottom_diff, top_diff, sizeof(Dtype) * top[0]->count()); @@ -76,7 +76,7 @@ void SoftmaxLayer::Backward_cpu(const vector*>& top, } // subtraction caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - scale_data, sum_multiplier_.cpu_data(), 1., bottom_diff); + scale_data, sum_multiplier_.const_data(), 1., bottom_diff); // elementwise multiplication caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); } diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index a884505d533..bba6f196ead 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -47,7 +47,7 @@ Dtype SoftmaxWithLossLayer::Forward( Dtype(FLT_MIN))); } if (top->size() >= 1) { - (*top)[0]->mutable_cpu_data()[0] = loss / num; + (*top)[0]->mutable_data()[0] = loss / num; } if (top->size() == 2) { (*top)[1]->ShareData(prob_); @@ -56,7 +56,7 @@ Dtype SoftmaxWithLossLayer::Forward( } template -void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, +void SoftmaxWithLossLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[1]) { @@ -64,17 +64,17 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - const Dtype* prob_data = prob_.cpu_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); + const Dtype* prob_data = prob_.const_data(); memcpy(bottom_diff, prob_data, sizeof(Dtype) * prob_.count()); - const Dtype* label = (*bottom)[1]->cpu_data(); + const Dtype* label = (*bottom)[1]->const_data(); int num = prob_.num(); int dim = prob_.count() / num; for (int i = 0; i < num; ++i) { bottom_diff[i * dim + static_cast(label[i])] -= 1; } // Scale down gradient - this->device_scal(prob_.count(), Dtype(1) / num, bottom_diff); + this->device_->scal(prob_.count(), Dtype(1) / num, bottom_diff); } } diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index aaa624bb7d3..fe01c5b4efb 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -36,15 +36,9 @@ Dtype SplitLayer::Forward(const vector*>& bottom, } template -<<<<<<< HEAD -void SplitLayer::Backward_cpu(const vector*>& top, +void SplitLayer::Backward(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { -======= -void SplitLayer::Backward(const vector*>& top, - const bool propagate_down, vector*>* bottom) { - if (propagate_down) { ->>>>>>> Unify the CPU/GPU Forward/Backward of the SplitLayer (*bottom)[0]->ShareDiff(*top[0]); // Add remaining top blob diffs. 
Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); diff --git a/src/caffe/layers/split_layer.cu b/src/caffe/layers/split_layer.cu deleted file mode 100644 index 4c921d39f17..00000000000 --- a/src/caffe/layers/split_layer.cu +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2014 BVLC and contributors. - -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -Dtype SplitLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - for (int i = 0; i < top->size(); ++i) { - (*top)[i]->ShareData(*bottom[0]); - } - return Dtype(0.); -} - -template -void SplitLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, vector*>* bottom) { - if (propagate_down[0]) { - (*bottom)[0]->ShareDiff(*top[0]); - // Add remaining top blob diffs. - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - for (int i = 1; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); - } - } -} - - -INSTANTIATE_CLASS(SplitLayer); - -} // namespace caffe diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp index 6b5166d53e9..8dc6ba7b06d 100644 --- a/src/caffe/layers/tanh_layer.cpp +++ b/src/caffe/layers/tanh_layer.cpp @@ -13,8 +13,8 @@ namespace caffe { template Dtype TanHLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); Dtype exp2x; const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { @@ -29,9 +29,9 @@ void TanHLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, vector*>* bottom) { if (propagate_down[0]) { - const Dtype* top_data = top[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* top_data = top[0]->const_data(); + const Dtype* top_diff = top[0]->const_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); const int count = (*bottom)[0]->count(); Dtype tanhx; for (int i = 0; i < count; ++i) { diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp index e6ed8a6b40e..47d58589064 100644 --- a/src/caffe/layers/threshold_layer.cpp +++ b/src/caffe/layers/threshold_layer.cpp @@ -18,8 +18,8 @@ void ThresholdLayer::SetUp(const vector*>& bottom, template Dtype ThresholdLayer::Forward_cpu(const vector*>& bottom, vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->const_data(); + Dtype* top_data = (*top)[0]->mutable_data(); const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { top_data[i] = (bottom_data[i] > threshold_) ? 
Dtype(1) : Dtype(0); diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index f364e6767c6..cf86d2fac50 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -482,20 +482,9 @@ void Net::Update() { const int count = params_[i]->count(); const Dtype* this_diff; Dtype* owner_diff; - switch (Caffe::mode()) { - case Caffe::CPU: - this_diff = params_[i]->cpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); - caffe_add(count, this_diff, owner_diff, owner_diff); - break; - case Caffe::GPU: - this_diff = params_[i]->gpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_gpu_diff(); - caffe_gpu_add(count, this_diff, owner_diff, owner_diff); - break; - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } + this_diff = params_[i]->const_diff(); + owner_diff = params_[param_owners_[i]]->mutable_diff(); + caffe_add(count, this_diff, owner_diff, owner_diff); } // Now, update the owned parameters. for (int i = 0; i < params_.size(); ++i) { diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 769618175ac..8c7892a8924 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -153,7 +153,7 @@ void Solver::Test(const int test_net_id) { } if (i == 0) { for (int j = 0; j < result.size(); ++j) { - const Dtype* result_vec = result[j]->cpu_data(); + const Dtype* result_vec = result[j]->const_data(); for (int k = 0; k < result[j]->count(); ++k) { test_score.push_back(result_vec[k]); } @@ -161,7 +161,7 @@ void Solver::Test(const int test_net_id) { } else { int idx = 0; for (int j = 0; j < result.size(); ++j) { - const Dtype* result_vec = result[j]->cpu_data(); + const Dtype* result_vec = result[j]->const_data(); for (int k = 0; k < result[j]->count(); ++k) { test_score[idx++] += result_vec[k]; } @@ -272,51 +272,24 @@ void SGDSolver::ComputeUpdateValue() { } Dtype momentum = this->param_.momentum(); Dtype weight_decay = this->param_.weight_decay(); - switch (Caffe::mode()) { - case Caffe::CPU: - for (int param_id = 0; param_id < net_params.size(); ++param_id) { - // Compute the value to history, and then copy them to the blob's diff. - Dtype local_rate = rate * net_params_lr[param_id]; - Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - history_[param_id]->mutable_cpu_data()); - if (local_decay) { - // add weight decay - caffe_axpy(net_params[param_id]->count(), - local_decay * local_rate, - net_params[param_id]->cpu_data(), - history_[param_id]->mutable_cpu_data()); - } - // copy - caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - } - break; - case Caffe::GPU: - for (int param_id = 0; param_id < net_params.size(); ++param_id) { - // Compute the value to history, and then copy them to the blob's diff. 
- Dtype local_rate = rate * net_params_lr[param_id]; - Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; - caffe_gpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->gpu_diff(), momentum, - history_[param_id]->mutable_gpu_data()); - if (local_decay) { - // add weight decay - caffe_gpu_axpy(net_params[param_id]->count(), - local_decay * local_rate, - net_params[param_id]->gpu_data(), - history_[param_id]->mutable_gpu_data()); - } - // copy - caffe_gpu_copy(net_params[param_id]->count(), - history_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); + for (int param_id = 0; param_id < net_params.size(); ++param_id) { + // Compute the value to history, and then copy them to the blob's diff. + Dtype local_rate = rate * net_params_lr[param_id]; + Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->const_diff(), momentum, + history_[param_id]->mutable_data()); + if (local_decay) { + // add weight decay + caffe_axpy(net_params[param_id]->count(), + local_decay * local_rate, + net_params[param_id]->const_data(), + history_[param_id]->mutable_data()); } - break; - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + // copy + caffe_copy(net_params[param_id]->count(), + history_[param_id]->const_data(), + net_params[param_id]->mutable_diff()); } } From e20f8a7353f71bb48b86b509ab02abfbc55a4570 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sat, 28 Jun 2014 09:51:16 +0800 Subject: [PATCH 74/75] Dynamically get the device to perform the math computations --- include/caffe/layer.hpp | 5 +-- src/caffe/layers/concat_layer.cpp | 10 +++-- src/caffe/layers/conv_layer.cpp | 42 +++++++++++-------- src/caffe/layers/data_layer.cpp | 4 +- src/caffe/layers/dropout_layer.cpp | 8 ++-- src/caffe/layers/eltwise_layer.cpp | 23 ++++++---- src/caffe/layers/euclidean_loss_layer.cpp | 7 ++-- src/caffe/layers/hdf5_data_layer.cpp | 4 +- src/caffe/layers/hdf5_output_layer.cpp | 4 +- src/caffe/layers/hinge_loss_layer.cpp | 19 ++++++--- src/caffe/layers/im2col_layer.cpp | 4 +- src/caffe/layers/image_data_layer.cpp | 4 +- src/caffe/layers/inner_product_layer.cpp | 19 +++++---- src/caffe/layers/power_layer.cpp | 40 +++++++++++------- .../sigmoid_cross_entropy_loss_layer.cpp | 6 ++- src/caffe/layers/softmax_loss_layer.cpp | 3 +- src/caffe/layers/split_layer.cpp | 3 +- src/caffe/layers/window_data_layer.cpp | 4 +- src/caffe/net.cpp | 3 +- src/caffe/solver.cpp | 7 ++-- 20 files changed, 128 insertions(+), 91 deletions(-) diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index c4ba24c488e..47dddb6ad22 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -23,7 +23,7 @@ class Layer { // to SetUp(), where the dimensions of the bottom blobs are provided to the // layer. explicit Layer(const LayerParameter& param) - : layer_param_(param), device_(DeviceFactory::GetDevice()) { + : layer_param_(param) { // The only thing we do is to copy blobs if there are any. if (layer_param_.blobs_size() > 0) { blobs_.resize(layer_param_.blobs_size()); @@ -98,9 +98,6 @@ class Layer { LayerParameter layer_param_; // The vector that stores the parameters as a set of blobs. vector > > blobs_; - // The math backend abstracts the CPU and the GPU specific - // implementation details - Device* device_; // Forward functions: compute the layer output // (and loss layers return the loss; other layers return the dummy value 0.) 
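The layer.hpp hunk above drops the cached device pointer from Layer, and the file diffs that follow replace every this->device_-> call with DeviceFactory::GetDevice()->, so the math backend is looked up at call time from the current Caffe mode instead of being fixed when the layer is constructed. The factory itself is not shown in this series; as a rough sketch only (the class and member names below are assumptions, not code from these patches), a mode-dispatching getter could look like:

  // Sketch under assumptions: Device<Dtype> is the abstract math backend with
  // CPUDevice<Dtype>/GPUDevice<Dtype> subclasses. Only the dispatch-on-mode
  // idea is the point; the real factory in the series may differ.
  template <typename Dtype>
  class DeviceFactory {
   public:
    static Device<Dtype>* GetDevice() {
      switch (Caffe::mode()) {
      case Caffe::CPU: {
        static CPUDevice<Dtype> cpu_device;   // constructed on first use
        return &cpu_device;
      }
      case Caffe::GPU: {
        static GPUDevice<Dtype> gpu_device;
        return &gpu_device;
      }
      default:
        LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
        return NULL;
      }
    }
  };

Resolving the backend on every call means a layer constructed before the mode changes still runs its math on whichever backend matches Caffe::mode() at the moment Forward or Backward actually executes.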
diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index a2c9aed8b94..1b90181f20a 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -49,7 +49,7 @@ Dtype ConcatLayer::Forward(const vector*>& bottom, for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->const_data(); int num_elem = bottom[i]->count(); - this->device_->copy(num_elem, bottom_data, + DeviceFactory::GetDevice()->copy(num_elem, bottom_data, top_data+(*top)[0]->offset(offset_num)); offset_num += bottom[i]->num(); } @@ -60,7 +60,8 @@ Dtype ConcatLayer::Forward(const vector*>& bottom, int num_elem = bottom[i]->channels()*bottom[i]->height()*bottom[i]->width(); for (int n = 0; n < num_; ++n) { - this->device_->copy(num_elem, bottom_data+bottom[i]->offset(n), + DeviceFactory::GetDevice()->copy( + num_elem, bottom_data+bottom[i]->offset(n), top_data+(*top)[0]->offset(n, offset_channel)); } offset_channel += bottom[i]->channels(); @@ -81,7 +82,7 @@ void ConcatLayer::Backward(const vector*>& top, for (int i = 0; i < bottom->size(); ++i) { Blob* blob = (*bottom)[i]; Dtype* bottom_diff = blob->mutable_diff(); - this->device_->copy(blob->count(), + DeviceFactory::GetDevice()->copy(blob->count(), top_diff+top[0]->offset(offset_num), bottom_diff); offset_num += blob->num(); } @@ -92,7 +93,8 @@ void ConcatLayer::Backward(const vector*>& top, Dtype* bottom_diff = blob->mutable_diff(); int num_elem = blob->channels()*blob->height()*blob->width(); for (int n = 0; n < num_; ++n) { - this->device_->copy(num_elem, top_diff+top[0]->offset(n, offset_channel), + DeviceFactory::GetDevice()->copy( + num_elem, top_diff+top[0]->offset(n, offset_channel), bottom_diff+blob->offset(n)); } offset_channel += blob->channels(); diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 8fba11fa2fd..f791d32affb 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -88,18 +88,20 @@ Dtype ConvolutionLayer::Forward(const vector*>& bottom, int top_offset = M_ * N_; for (int n = 0; n < num_; ++n) { // First, im2col - this->device_->im2col( + DeviceFactory::GetDevice()->im2col( bottom_data + bottom[0]->offset(n), channels_, height_, width_, kernel_size_, pad_, stride_, col_data); // Second, innerproduct with groups for (int g = 0; g < group_; ++g) { - this->device_->gemm(CblasNoTrans, CblasNoTrans, M_, N_, K_, - (Dtype)1., weight + weight_offset * g, col_data + col_offset * g, - (Dtype)0., top_data + (*top)[0]->offset(n) + top_offset * g); + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasNoTrans, M_, N_, K_, + (Dtype)1., weight + weight_offset * g, col_data + col_offset * g, + (Dtype)0., top_data + (*top)[0]->offset(n) + top_offset * g); } // third, add bias if (bias_term_) { - this->device_->gemm(CblasNoTrans, CblasNoTrans, num_output_, + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasNoTrans, num_output_, N_, 1, (Dtype)1., this->blobs_[1]->const_data(), reinterpret_cast(bias_multiplier_->const_data()), (Dtype)1., top_data + (*top)[0]->offset(n)); @@ -123,9 +125,10 @@ void ConvolutionLayer::Backward(const vector*>& top, if (bias_term_) { bias_diff = this->blobs_[1]->mutable_diff(); - this->device_->set(this->blobs_[1]->count(), 0, bias_diff); + DeviceFactory::GetDevice()->set( + this->blobs_[1]->count(), 0, bias_diff); for (int n = 0; n < num_; ++n) { - this->device_->gemv(CblasNoTrans, num_output_, N_, + DeviceFactory::GetDevice()->gemv(CblasNoTrans, num_output_, N_, 1., top_diff + 
top[0]->offset(n), reinterpret_cast(bias_multiplier_->const_data()), 1., bias_diff); @@ -135,30 +138,33 @@ void ConvolutionLayer::Backward(const vector*>& top, int weight_offset = M_ * K_; int col_offset = K_ * N_; int top_offset = M_ * N_; - this->device_->set(this->blobs_[0]->count(), 0, weight_diff); + DeviceFactory::GetDevice()->set( + this->blobs_[0]->count(), 0, weight_diff); for (int n = 0; n < num_; ++n) { // since we saved memory in the forward pass by not storing all col data, // we will need to recompute them. - this->device_->im2col( + DeviceFactory::GetDevice()->im2col( bottom_data + (*bottom)[0]->offset(n), channels_, height_, width_, kernel_size_, pad_, stride_, col_data); // gradient w.r.t. weight. Note that we will accumulate diffs. for (int g = 0; g < group_; ++g) { - this->device_->gemm(CblasNoTrans, CblasTrans, M_, K_, N_, - (Dtype)1., top_diff + top[0]->offset(n) + top_offset * g, - col_data + col_offset * g, (Dtype)1., - weight_diff + weight_offset * g); + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasTrans, M_, K_, N_, + (Dtype)1., top_diff + top[0]->offset(n) + top_offset * g, + col_data + col_offset * g, (Dtype)1., + weight_diff + weight_offset * g); } // gradient w.r.t. bottom data, if necessary if (propagate_down[0]) { for (int g = 0; g < group_; ++g) { - this->device_->gemm(CblasTrans, CblasNoTrans, K_, N_, M_, - (Dtype)1., weight + weight_offset * g, - top_diff + top[0]->offset(n) + top_offset * g, - (Dtype)0., col_diff + col_offset * g); + DeviceFactory::GetDevice()->gemm( + CblasTrans, CblasNoTrans, K_, N_, M_, + (Dtype)1., weight + weight_offset * g, + top_diff + top[0]->offset(n) + top_offset * g, + (Dtype)0., col_diff + col_offset * g); } // col2im back to the data - this->device_->col2im( + DeviceFactory::GetDevice()->col2im( col_diff, channels_, height_, width_, kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n)); } diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 8c785765e96..c3503d838cd 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -351,11 +351,11 @@ Dtype DataLayer::Forward(const vector*>& bottom, // First, join the thread JoinPrefetchThread(); // Copy the data - this->device_->copy_from_cpu( + DeviceFactory::GetDevice()->copy_from_cpu( prefetch_data_->count(), prefetch_data_->const_data(), (*top)[0]->mutable_data()); if (output_labels_) { - this->device_->copy_from_cpu( + DeviceFactory::GetDevice()->copy_from_cpu( prefetch_label_->count(), prefetch_label_->const_data(), (*top)[1]->mutable_data()); } diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index fa11c17ac5d..eafb74aaae7 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -35,12 +35,14 @@ Dtype DropoutLayer::Forward_cpu(const vector*>& bottom, const int count = bottom[0]->count(); if (Caffe::phase() == Caffe::TRAIN) { // Create random numbers - caffe_rng_bernoulli(count, 1. - threshold_, mask); + DeviceFactory::GetDevice()->rng_bernoulli(count, 1. 
- threshold_, + mask); for (int i = 0; i < count; ++i) { top_data[i] = bottom_data[i] * mask[i] * scale_; } } else { - this->device_->copy(bottom[0]->count(), bottom_data, top_data); + DeviceFactory::GetDevice()->copy(bottom[0]->count(), bottom_data, + top_data); } return Dtype(0); } @@ -61,8 +63,6 @@ void DropoutLayer::Backward_cpu(const vector*>& top, } } - INSTANTIATE_CLASS(DropoutLayer); - } // namespace caffe diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index 7edfe3720c7..adf879e6842 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -47,18 +47,19 @@ Dtype EltwiseLayer::Forward( Dtype* top_data = (*top)[0]->mutable_data(); switch (op_) { case EltwiseParameter_EltwiseOp_PROD: - this->device_->mul(count, bottom[0]->const_data(), + DeviceFactory::GetDevice()->mul(count, bottom[0]->const_data(), bottom[1]->const_data(), top_data); for (int i = 2; i < bottom.size(); ++i) { - this->device_->mul(count, top_data, bottom[i]->const_data(), top_data); + DeviceFactory::GetDevice()->mul( + count, top_data, bottom[i]->const_data(), top_data); } break; case EltwiseParameter_EltwiseOp_SUM: - this->device_->set(count, Dtype(0), top_data); + DeviceFactory::GetDevice()->set(count, Dtype(0), top_data); // TODO(shelhamer) does BLAS optimize to sum for coeff = 1? for (int i = 0; i < bottom.size(); ++i) { - this->device_->axpy(count, coeffs_[i], bottom[i]->const_data(), - top_data); + DeviceFactory::GetDevice()->axpy( + count, coeffs_[i], bottom[i]->const_data(), top_data); } break; default: @@ -79,14 +80,18 @@ void EltwiseLayer::Backward(const vector*>& top, Dtype* bottom_diff = (*bottom)[i]->mutable_diff(); switch (op_) { case EltwiseParameter_EltwiseOp_PROD: - this->device_->div(count, top_data, bottom_data, bottom_diff); - this->device_->mul(count, bottom_diff, top_diff, bottom_diff); + DeviceFactory::GetDevice()->div( + count, top_data, bottom_data, bottom_diff); + DeviceFactory::GetDevice()->mul( + count, bottom_diff, top_diff, bottom_diff); break; case EltwiseParameter_EltwiseOp_SUM: if (coeffs_[i] == Dtype(1)) { - this->device_->copy(count, top_diff, bottom_diff); + DeviceFactory::GetDevice()->copy( + count, top_diff, bottom_diff); } else { - this->device_->scale(count, coeffs_[i], top_diff, bottom_diff); + DeviceFactory::GetDevice()->scale( + count, coeffs_[i], top_diff, bottom_diff); } break; default: diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index 2c1dbe5c101..a88ec8ea3cc 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -23,13 +23,14 @@ template Dtype EuclideanLossLayer::Forward(const vector*>& bottom, vector*>* top) { int count = bottom[0]->count(); - this->device_->sub( + DeviceFactory::GetDevice()->sub( count, bottom[0]->const_data(), bottom[1]->const_data(), diff_.mutable_data()); Dtype dot; - this->device_->dot(count, diff_.const_data(), diff_.const_data(), &dot); + DeviceFactory::GetDevice()->dot(count, diff_.const_data(), + diff_.const_data(), &dot); Dtype loss = dot / bottom[0]->num() / Dtype(2); if (top->size() == 1) { (*top)[0]->mutable_data()[0] = loss; @@ -43,7 +44,7 @@ void EuclideanLossLayer::Backward(const vector*>& top, for (int i = 0; i < 2; ++i) { if (propagate_down[i]) { const Dtype sign = (i == 0) ? 
1 : -1; - this->device_->axpby( + DeviceFactory::GetDevice()->axpby( (*bottom)[i]->count(), // count sign / (*bottom)[i]->num(), // alpha diff_.const_data(), // a diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index cf8dff3507a..02121512f16 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -104,10 +104,10 @@ Dtype HDF5DataLayer::Forward(const vector*>& bottom, } current_row_ = 0; } - this->device_->copy_from_cpu( + DeviceFactory::GetDevice()->copy_from_cpu( data_count, &data_blob_.const_data()[current_row_ * data_count], &(*top)[0]->mutable_data()[i * data_count]); - this->device_->copy_from_cpu( + DeviceFactory::GetDevice()->copy_from_cpu( label_data_count, &label_blob_.const_data()[current_row_ * label_data_count], &(*top)[1]->mutable_data()[i * label_data_count]); diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index 0601cd9f473..cc87e777f81 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -54,10 +54,10 @@ Dtype HDF5OutputLayer::Forward(const vector*>& bottom, const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); for (int i = 0; i < bottom[0]->num(); ++i) { - this->device_->copy_from_cpu( + DeviceFactory::GetDevice()->copy_from_cpu( data_datum_dim, &bottom[0]->const_data()[i * data_datum_dim], &data_blob_.mutable_data()[i * data_datum_dim]); - this->device_->copy_from_cpu( + DeviceFactory::GetDevice()->copy_from_cpu( label_datum_dim, &bottom[1]->const_data()[i * label_datum_dim], &label_blob_.mutable_data()[i * label_datum_dim]); } diff --git a/src/caffe/layers/hinge_loss_layer.cpp b/src/caffe/layers/hinge_loss_layer.cpp index 528b111bba4..221b4449360 100644 --- a/src/caffe/layers/hinge_loss_layer.cpp +++ b/src/caffe/layers/hinge_loss_layer.cpp @@ -24,7 +24,7 @@ Dtype HingeLossLayer::Forward(const vector*>& bottom, int count = bottom[0]->count(); int dim = count / num; - this->device_->copy(count, bottom_data, bottom_diff); + DeviceFactory::GetDevice()->copy(count, bottom_data, bottom_diff); for (int i = 0; i < num; ++i) { bottom_diff[i * dim + static_cast(label[i])] *= -1; } @@ -35,9 +35,14 @@ Dtype HingeLossLayer::Forward(const vector*>& bottom, } switch (this->layer_param_.hinge_loss_param().norm()) { case HingeLossParameter_Norm_L1: - return caffe_cpu_asum(count, bottom_diff) / num; + Dtype sum; + DeviceFactory::GetDevice()->asum(count, bottom_diff, &sum); + return sum / num; case HingeLossParameter_Norm_L2: - return caffe_cpu_dot(count, bottom_diff, bottom_diff) / num; + Dtype dot; + DeviceFactory::GetDevice()->dot(count, bottom_diff, + bottom_diff, &dot); + return dot / num; default: LOG(FATAL) << "Unknown Norm"; } @@ -63,11 +68,13 @@ void HingeLossLayer::Backward(const vector*>& top, switch (this->layer_param_.hinge_loss_param().norm()) { case HingeLossParameter_Norm_L1: - this->device_->sign(count, bottom_diff, bottom_diff); - this->device_->scal(count, Dtype(1. / num), bottom_diff); + DeviceFactory::GetDevice()->sign(count, bottom_diff, bottom_diff); + DeviceFactory::GetDevice()->scal(count, Dtype(1. / num), + bottom_diff); break; case HingeLossParameter_Norm_L2: - this->device_->scal(count, Dtype(2. / num), bottom_diff); + DeviceFactory::GetDevice()->scal(count, Dtype(2. 
/ num), + bottom_diff); break; default: LOG(FATAL) << "Unknown Norm"; diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index 26585e67ddc..13048829796 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -30,7 +30,7 @@ Dtype Im2colLayer::Forward(const vector*>& bottom, const Dtype* bottom_data = bottom[0]->const_data(); Dtype* top_data = (*top)[0]->mutable_data(); for (int n = 0; n < bottom[0]->num(); ++n) { - this->device_->im2col( + DeviceFactory::GetDevice()->im2col( bottom_data + bottom[0]->offset(n), channels_, height_, width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n)); } @@ -43,7 +43,7 @@ void Im2colLayer::Backward(const vector*>& top, const Dtype* top_diff = top[0]->const_diff(); Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); for (int n = 0; n < top[0]->num(); ++n) { - this->device_->col2im( + DeviceFactory::GetDevice()->col2im( top_diff + top[0]->offset(n), channels_, height_, width_, kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n)); } diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index 774add7155c..a890090efa6 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -278,10 +278,10 @@ Dtype ImageDataLayer::Forward(const vector*>& bottom, // First, join the thread JoinPrefetchThread(); // Copy the data - this->device_->copy_from_cpu( + DeviceFactory::GetDevice()->copy_from_cpu( prefetch_data_->count(), prefetch_data_->const_data(), (*top)[0]->mutable_data()); - this->device_->copy_from_cpu( + DeviceFactory::GetDevice()->copy_from_cpu( prefetch_label_->count(), prefetch_label_->const_data(), (*top)[1]->mutable_data()); // Start a new prefetch thread diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 31f3f490f7f..08278c538da 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -62,10 +62,12 @@ Dtype InnerProductLayer::Forward(const vector*>& bottom, const Dtype* bottom_data = bottom[0]->const_data(); Dtype* top_data = (*top)[0]->mutable_data(); const Dtype* weight = this->blobs_[0]->const_data(); - this->device_->gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., bottom_data, weight, (Dtype)0., top_data); if (bias_term_) { - this->device_->gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., reinterpret_cast(bias_multiplier_->const_data()), this->blobs_[1]->const_data(), (Dtype)1., top_data); } @@ -79,17 +81,20 @@ void InnerProductLayer::Backward(const vector*>& top, const Dtype* top_diff = top[0]->const_diff(); const Dtype* bottom_data = (*bottom)[0]->const_data(); // Gradient with respect to weight - this->device_->gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., + DeviceFactory::GetDevice()->gemm( + CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_diff()); if (bias_term_) { // Gradient with respect to bias - this->device_->gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - reinterpret_cast(bias_multiplier_->const_data()), (Dtype)0., - this->blobs_[1]->mutable_diff()); + DeviceFactory::GetDevice()->gemv( + CblasTrans, M_, N_, (Dtype)1., top_diff, + reinterpret_cast(bias_multiplier_->const_data()), + (Dtype)0., this->blobs_[1]->mutable_diff()); } if 
(propagate_down[0]) { // Gradient with respect to bottom data - this->device_->gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., + DeviceFactory::GetDevice()->gemm( + CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., top_diff, this->blobs_[0]->const_data(), (Dtype)0., (*bottom)[0]->mutable_diff()); } diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index f596cd83d06..766214ecf01 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -30,19 +30,19 @@ Dtype PowerLayer::Forward(const vector*>& bottom, // Special case where we can ignore the input: scale or power is 0. if (diff_scale_ == Dtype(0)) { Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); - this->device_->set(count, value, top_data); + DeviceFactory::GetDevice()->set(count, value, top_data); return Dtype(0); } const Dtype* bottom_data = bottom[0]->const_data(); - this->device_->copy(count, bottom_data, top_data); + DeviceFactory::GetDevice()->copy(count, bottom_data, top_data); if (scale_ != Dtype(1)) { - this->device_->scal(count, scale_, top_data); + DeviceFactory::GetDevice()->scal(count, scale_, top_data); } if (shift_ != Dtype(0)) { - this->device_->add_scalar(count, shift_, top_data); + DeviceFactory::GetDevice()->add_scalar(count, shift_, top_data); } if (power_ != Dtype(1)) { - this->device_->powx(count, top_data, power_, top_data); + DeviceFactory::GetDevice()->powx(count, top_data, power_, top_data); } return Dtype(0); } @@ -56,7 +56,7 @@ void PowerLayer::Backward(const vector*>& top, const int count = (*bottom)[0]->count(); const Dtype* top_diff = top[0]->const_diff(); if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - this->device_->set(count, diff_scale_, bottom_diff); + DeviceFactory::GetDevice()->set(count, diff_scale_, bottom_diff); } else { const Dtype* bottom_data = (*bottom)[0]->const_data(); // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) @@ -65,10 +65,12 @@ void PowerLayer::Backward(const vector*>& top, // Special case for y = (shift + scale * x)^2 // -> dy/dx = 2 * scale * (shift + scale * x) // = diff_scale * shift + diff_scale * scale * x - this->device_->axpby(count, diff_scale_ * scale_, bottom_data, + DeviceFactory::GetDevice()->axpby( + count, diff_scale_ * scale_, bottom_data, Dtype(0), bottom_diff); if (shift_ != Dtype(0)) { - this->device_->add_scalar(count, diff_scale_ * shift_, bottom_diff); + DeviceFactory::GetDevice()->add_scalar( + count, diff_scale_ * shift_, bottom_diff); } } else if (shift_ == Dtype(0)) { // Special case for y = (scale * x)^power @@ -76,25 +78,31 @@ void PowerLayer::Backward(const vector*>& top, // = scale * power * (scale * x)^power * (scale * x)^(-1) // = power * y / x const Dtype* top_data = top[0]->const_data(); - this->device_->div(count, top_data, bottom_data, bottom_diff); - this->device_->scal(count, power_, bottom_diff); + DeviceFactory::GetDevice()->div( + count, top_data, bottom_data, bottom_diff); + DeviceFactory::GetDevice()->scal(count, power_, bottom_diff); } else { - this->device_->copy(count, bottom_data, bottom_diff); + DeviceFactory::GetDevice()->copy(count, bottom_data, + bottom_diff); if (scale_ != Dtype(1)) { - this->device_->scal(count, scale_, bottom_diff); + DeviceFactory::GetDevice()->scal(count, scale_, bottom_diff); } if (shift_ != Dtype(0)) { - this->device_->add_scalar(count, shift_, bottom_diff); + DeviceFactory::GetDevice()->add_scalar(count, shift_, + bottom_diff); } const Dtype* top_data = top[0]->const_data(); - this->device_->div(count, 
top_data, bottom_diff, bottom_diff); + DeviceFactory::GetDevice()->div(count, top_data, bottom_diff, + bottom_diff); if (diff_scale_ != Dtype(1)) { - this->device_->scal(count, diff_scale_, bottom_diff); + DeviceFactory::GetDevice()->scal(count, diff_scale_, + bottom_diff); } } } if (diff_scale_ != Dtype(0)) { - this->device_->mul(count, top_diff, bottom_diff, bottom_diff); + DeviceFactory::GetDevice()->mul(count, top_diff, bottom_diff, + bottom_diff); } } } diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index 367093964b2..fcc3a19a14d 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -62,9 +62,11 @@ void SigmoidCrossEntropyLossLayer::Backward( const Dtype* sigmoid_output_data = sigmoid_output_->const_data(); const Dtype* target = (*bottom)[1]->const_data(); Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); - this->device_->sub(count, sigmoid_output_data, target, bottom_diff); + DeviceFactory::GetDevice()->sub( + count, sigmoid_output_data, target, bottom_diff); // Scale down gradient - this->device_->scal(count, Dtype(1) / num, bottom_diff); + DeviceFactory::GetDevice()->scal(count, Dtype(1) / num, + bottom_diff); } } diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index bba6f196ead..19c04f874aa 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -74,7 +74,8 @@ void SoftmaxWithLossLayer::Backward(const vector*>& top, bottom_diff[i * dim + static_cast(label[i])] -= 1; } // Scale down gradient - this->device_->scal(prob_.count(), Dtype(1) / num, bottom_diff); + DeviceFactory::GetDevice()->scal(prob_.count(), Dtype(1) / num, + bottom_diff); } } diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index fe01c5b4efb..f90ea678b95 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -44,7 +44,8 @@ void SplitLayer::Backward(const vector*>& top, Dtype* bottom_diff = (*bottom)[0]->mutable_diff(); for (int i = 1; i < top.size(); ++i) { const Dtype* top_diff = top[i]->const_diff(); - this->device_->axpy(count_, Dtype(1.), top_diff, bottom_diff); + DeviceFactory::GetDevice()->axpy( + count_, Dtype(1.), top_diff, bottom_diff); } } } diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index 5817ab532bd..a068ff008b4 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -444,10 +444,10 @@ Dtype WindowDataLayer::Forward(const vector*>& bottom, // First, join the thread JoinPrefetchThread(); // Copy the data - this->device_->copy_from_cpu( + DeviceFactory::GetDevice()->copy_from_cpu( prefetch_data_->count(), prefetch_data_->const_data(), (*top)[0]->mutable_data()); - this->device_->copy_from_cpu( + DeviceFactory::GetDevice()->copy_from_cpu( prefetch_label_->count(), prefetch_label_->const_data(), (*top)[1]->mutable_data()); // Start a new prefetch thread diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index cf86d2fac50..30c823f843a 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -484,7 +484,8 @@ void Net::Update() { Dtype* owner_diff; this_diff = params_[i]->const_diff(); owner_diff = params_[param_owners_[i]]->mutable_diff(); - caffe_add(count, this_diff, owner_diff, owner_diff); + DeviceFactory::GetDevice()->add(count, this_diff, owner_diff, + owner_diff); } // Now, update the owned parameters. 
for (int i = 0; i < params_.size(); ++i) { diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 8c7892a8924..3642bf9b40e 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -276,18 +276,19 @@ void SGDSolver::ComputeUpdateValue() { // Compute the value to history, and then copy them to the blob's diff. Dtype local_rate = rate * net_params_lr[param_id]; Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + DeviceFactory::GetDevice()->axpby( + net_params[param_id]->count(), local_rate, net_params[param_id]->const_diff(), momentum, history_[param_id]->mutable_data()); if (local_decay) { // add weight decay - caffe_axpy(net_params[param_id]->count(), + DeviceFactory::GetDevice()->axpy(net_params[param_id]->count(), local_decay * local_rate, net_params[param_id]->const_data(), history_[param_id]->mutable_data()); } // copy - caffe_copy(net_params[param_id]->count(), + DeviceFactory::GetDevice()->copy(net_params[param_id]->count(), history_[param_id]->const_data(), net_params[param_id]->mutable_diff()); } From 8f868c1b91b8cfb5734df1dc8f6649f049bfe81d Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sat, 28 Jun 2014 10:08:53 +0800 Subject: [PATCH 75/75] Implement device independent data getters for the SyncedMemory --- include/caffe/syncedmem.hpp | 7 +++++-- src/caffe/blob.cpp | 4 ++-- src/caffe/syncedmem.cpp | 24 ++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index ebe29542a17..30e7d771ad4 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -49,8 +49,8 @@ class AbstractSyncedMemory { virtual SyncedHead head() { return head_; } virtual size_t size() { return size_; } - const void* const_data() const { return NULL; } - void* mutable_data() { return NULL;} + virtual const void* const_data() { return NULL; } + virtual void* mutable_data() { return NULL;} protected: virtual void to_cpu() = 0; @@ -75,6 +75,9 @@ class SyncedMemory : public AbstractSyncedMemory { void* mutable_cpu_data(); void* mutable_gpu_data(); + const void* const_data(); + void* mutable_data(); + protected: void to_cpu(); void to_gpu(); diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index 4e0a14160cc..64d77bdbfeb 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -131,7 +131,7 @@ Dtype* Blob::mutable_data() { return mutable_gpu_data(); default: LOG(FATAL) << "Unknown caffe mode."; - return static_cast(0); + return static_cast(NULL); } } @@ -144,7 +144,7 @@ Dtype* Blob::mutable_diff() { return mutable_gpu_diff(); default: LOG(FATAL) << "Unknown caffe mode."; - return static_cast(0); + return static_cast(NULL); } } diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index a38ff8d9028..76c99bcf9bc 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -93,5 +93,29 @@ void* SyncedMemory::mutable_gpu_data() { return gpu_ptr_; } +const void* SyncedMemory::const_data() { + switch (Caffe::mode()) { + case Caffe::CPU: + return cpu_data(); + case Caffe::GPU: + return gpu_data(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} + +void* SyncedMemory::mutable_data() { + switch (Caffe::mode()) { + case Caffe::CPU: + return mutable_cpu_data(); + case Caffe::GPU: + return mutable_gpu_data(); + default: + LOG(FATAL) << "Unknown caffe mode."; + return static_cast(NULL); + } +} + } // namespace caffe
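This final patch gives SyncedMemory the same mode-dispatching const_data()/mutable_data() getters that Blob's data accessors already use, so calling code never branches on Caffe::mode() itself. A minimal usage sketch, assuming two Blob<float> instances src and dst of equal count (the Device/DeviceFactory template arguments are assumed instantiations; copy and scal are the backend signatures used throughout this series):

  // One code path serves both CPU and GPU modes: the data getters and the
  // factory each dispatch on Caffe::mode() internally.
  Device<float>* device = DeviceFactory<float>::GetDevice();
  device->copy(src.count(), src.const_data(), dst.mutable_data());  // dst = src
  device->scal(dst.count(), 0.5f, dst.mutable_data());              // dst *= 0.5

Pushing the CPU/GPU choice down into SyncedMemory, Blob, and the device factory is what lets the earlier patches delete the paired Forward_cpu/Forward_gpu and Backward_cpu/Backward_gpu implementations and the per-mode switch blocks in Net::Update and SGDSolver::ComputeUpdateValue.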