From 4e936f7c8dbae5b6e33c9aad8e162e3edb02702d Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Tue, 1 Dec 2015 21:48:27 -0800
Subject: [PATCH 1/8] Fixed large allocations cache

---
 3rdparty/cub/cub/util_allocator.cuh | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/3rdparty/cub/cub/util_allocator.cuh b/3rdparty/cub/cub/util_allocator.cuh
index 7f6bebe7f74..e0e0e4562eb 100644
--- a/3rdparty/cub/cub/util_allocator.cuh
+++ b/3rdparty/cub/cub/util_allocator.cuh
@@ -407,6 +407,11 @@ struct CachingDeviceAllocator
         while ( (block_itr != cached_blocks.end())
                 && (block_itr->device == device)
                 && (block_itr->bin == search_key.bin)) {
+
+          // use special rule for the last ("exact size") bin: set max memory overuse to 1/8th
+          if (search_key.bin == (unsigned int) -1 && (block_itr->bytes - search_key.bytes)*8UL > search_key.bytes)
+            break;
+
           cudaStream_t prev_stream = block_itr->associated_stream;
 	  if ((active_stream == prev_stream)
 	      || (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
@@ -460,11 +465,11 @@ struct CachingDeviceAllocator
 	      error = cudaMalloc(&search_key.d_ptr, search_key.bytes);
 	      //	      }
 	    }
-	    if (CubDebug(error)) 
+	    if (CubDebug(error))
 	      return error;
-	    if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) 
+	    if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
 	      return error;
-	    
+
 	    // Insert into live blocks
 	    Lock(&spin_lock);
 	    live_blocks.insert(search_key);
@@ -543,30 +548,30 @@ struct CachingDeviceAllocator
 
 	// Lock
 	Lock(&spin_lock);
-	
+
 	// Find corresponding block descriptor
 	BusyBlocks::iterator block_itr = live_blocks.find(search_key);
 	if (block_itr != live_blocks.end()) {
 	  // Remove from live blocks
 	  search_key = *block_itr;
-	  live_blocks.erase(block_itr);	    
+	  live_blocks.erase(block_itr);
 	  cached_bytes[device].busy -= search_key.bytes;
-	  
+
 	  // Check if we should keep the returned allocation
 	  if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)
-	    {	      
+	    {
 	      // Insert returned allocation into free blocks
 	      cached_blocks.insert(search_key);
 	      cached_bytes[device].free += search_key.bytes;
 	      recached = true;
-	      if (debug) { 
+	      if (debug) {
 		CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
-		       device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), 
+		       device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
 		       (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);
 	      }
 	    }
 	}
-	
+
         Unlock(&spin_lock);
 
         if (recached) {
@@ -580,7 +585,7 @@ struct CachingDeviceAllocator
 	      return error;
 	    if (CubDebug(error = cudaSetDevice(device))) return error;
 	  }
-	  
+
           // Actually free device memory
           if (CubDebug(error = cudaFree(d_ptr))) return error;
           if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;

From b61ebf0eec6ab8c423d2cc00973886d14b776322 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Tue, 1 Dec 2015 23:42:00 -0800
Subject: [PATCH 2/8] Fixing GPU arena setup

---
 include/caffe/util/gpu_memory.hpp | 14 +++++++---
 src/caffe/util/gpu_memory.cpp     | 46 +++++++++++++++++--------------
 2 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/include/caffe/util/gpu_memory.hpp b/include/caffe/util/gpu_memory.hpp
index b1a6c919cf7..c57d98a78be 100644
--- a/include/caffe/util/gpu_memory.hpp
+++ b/include/caffe/util/gpu_memory.hpp
@@ -83,11 +83,17 @@ class gpu_memory {
   static void init(const std::vector<int>&, PoolMode, bool);
   static void destroy();
 
-  static bool     initialized_;
-  static PoolMode mode_;
-  static size_t   poolsize_;
-  static bool     debug_;
+  static bool             initialized_;
+  static PoolMode         mode_;
+  static bool             debug_;
+
 #ifndef CPU_ONLY
+  struct MemInfo {
+      size_t free;
+      size_t total;
+  };
+
+  static vector<MemInfo>  dev_info_;
 
  public:
   typedef void* pointer;
diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp
index 5dd312153cc..30e404e5f1b 100644
--- a/src/caffe/util/gpu_memory.cpp
+++ b/src/caffe/util/gpu_memory.cpp
@@ -12,10 +12,10 @@ namespace caffe {
 
 #ifndef CPU_ONLY  // CPU-only Caffe.
   static cub::CachingDeviceAllocator* cubAlloc = 0;
+  vector<gpu_memory::MemInfo> gpu_memory::dev_info_;
 #endif
 
   gpu_memory::PoolMode gpu_memory::mode_   = gpu_memory::NoPool;
-  size_t               gpu_memory::poolsize_ = 0;
   bool                 gpu_memory::debug_ = false;
 
 #ifdef CPU_ONLY  // CPU-only Caffe.
@@ -43,8 +43,7 @@ namespace caffe {
     }
     if (debug)
       std::cout << "gpu_memory initialized with "
-                << getPoolName() << ". Poolsize = "
-                << (1.0*poolsize_)/(1024.0*1024.0*1024.0) << " G." << std::endl;
+                << getPoolName()  << std::endl;
   }
 
   void gpu_memory::destroy() {
@@ -100,39 +99,44 @@ namespace caffe {
 
     CUDA_CHECK(cudaGetDevice(&initial_device));
 
+
     for (int i = 0; i < gpus.size(); i++) {
+      int cur_device = gpus[i];
+      if (cur_device+1 > dev_info_.size())
+        dev_info_.resize(cur_device+1);
+
       CUDA_CHECK(cudaSetDevice(gpus[i]));
-      size_t free_mem, total_mem;
       cudaDeviceProp props;
-      CUDA_CHECK(cudaGetDeviceProperties(&props, gpus[i]));
-      CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem));
+      CUDA_CHECK(cudaGetDeviceProperties(&props, cur_device));
+      CUDA_CHECK(cudaMemGetInfo(&dev_info_[cur_device].free,
+                                &dev_info_[cur_device].total));
 
       if (debug_) {
         std::cout << "cudaGetDeviceProperties: Mem = "
-                  << props.totalGlobalMem <<std:: endl;
-        std::cout << "cudaMemGetInfo: Free= " << free_mem
-                  << " Total= " << total_mem << std::endl;
+                  << props.totalGlobalMem <<std::endl;
+        std::cout << "cudaMemGetInfo_[" << cur_device
+                  <<": Free= " << dev_info_[cur_device].free
+                  << " Total= " << dev_info_[cur_device].total << std::endl;
       }
 
       // make sure we don't ask for more that total device memory
-      free_mem = std::min(total_mem, free_mem);
-      free_mem = size_t(0.95*std::min(props.totalGlobalMem, free_mem));
-      // find out the smallest GPU size
-      if (poolsize_ == 0 || poolsize_ > free_mem)
-        poolsize_ = free_mem;
-    }
+      dev_info_[i].free = std::min(dev_info_[cur_device].total,
+                                   dev_info_[cur_device].free);
+      dev_info_[i].free = std::min(props.totalGlobalMem,
+                                   dev_info_[cur_device].free);
+     }
 
 
     switch ( mode_ ) {
       case CubPool:
         try {
-          // if you are paranoid, that doesn't mean they are not after you :)
+          // just in case someone installed 'no cleanup' arena before
           delete cubAlloc;
 
-          cubAlloc = new cub::CachingDeviceAllocator( 2,   // defaults
+          cubAlloc = new cub::CachingDeviceAllocator( 2,
                                                       6,
                                                       16,
-                                                      poolsize_,
+                                                      (size_t)-1,
                                                       false,
                                                       debug_);
         }
@@ -160,13 +164,15 @@ namespace caffe {
     case CubPool:
       int cur_device;
       CUDA_CHECK(cudaGetDevice(&cur_device));
-      *total_mem = poolsize_;
+      *total_mem = dev_info_[cur_device].total;
       // Free memory is initial free memory minus outstanding allocations.
       // Assuming we only allocate via gpu_memory since its constructon.
-      *free_mem = poolsize_ - cubAlloc->cached_bytes[cur_device].busy;
+      *free_mem = dev_info_[cur_device].free -
+        cubAlloc->cached_bytes[cur_device].busy;
       break;
     default:
       CUDA_CHECK(cudaMemGetInfo(free_mem, total_mem));
+      break;
     }
   }
 #endif  // CPU_ONLY

From 37d8e766cef37dd07fb3c35a1ad7c0b5a70913a3 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Wed, 2 Dec 2015 12:50:37 -0800
Subject: [PATCH 3/8] Fixed debug print

---
 3rdparty/cub/cub/util_allocator.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/cub/cub/util_allocator.cuh b/3rdparty/cub/cub/util_allocator.cuh
index e0e0e4562eb..ff19cad55ce 100644
--- a/3rdparty/cub/cub/util_allocator.cuh
+++ b/3rdparty/cub/cub/util_allocator.cuh
@@ -667,7 +667,7 @@ struct CachingDeviceAllocator
             cached_blocks.erase(begin);
 
             if (debug) CubLog("\tdevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-                              current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].free);
+                              current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].busy);
         }
 
         Unlock(&spin_lock);

From 1c678452b9034aa891850f749e00d6df6cb044b8 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Wed, 2 Dec 2015 15:23:03 -0800
Subject: [PATCH 4/8] d

---
 tools/caffe.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index f6dbb89ec2d..306a5a55d34 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -407,7 +407,7 @@ int main(int argc, char** argv) {
   // initialize gpu memory arena
   vector<int> gpus;
   get_gpus(&gpus);
-  caffe::gpu_memory::arena arena(gpus, caffe::gpu_memory::DefaultPool, true);
+  caffe::gpu_memory::arena arena(gpus);
 
   if (argc == 2) {
 #ifdef WITH_PYTHON_LAYER

From 81baeda1ff2b2e4bf28d98b0f3853fde7f603ac8 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Wed, 2 Dec 2015 21:21:23 -0800
Subject: [PATCH 5/8] adding environment switch for memory diagnostic

---
 src/caffe/util/gpu_memory.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp
index 30e404e5f1b..b8761a7a2a2 100644
--- a/src/caffe/util/gpu_memory.cpp
+++ b/src/caffe/util/gpu_memory.cpp
@@ -26,9 +26,13 @@ namespace caffe {
     return "No GPU: CPU Only Memory";
   }
 #else
+
   void gpu_memory::init(const std::vector<int>& gpus,
                         PoolMode m, bool debug) {
-    debug_ = debug;
+
+    bool debug_env = (getenv("DEBUG_GPU_MEM")!=0);
+    debug_ = debug || debug_env;
+
     if (gpus.size() <= 0) {
       // should we report an error here ?
       m = gpu_memory::NoPool;

From 3fc98dd71c6ef13d4ae0995663f9f8a7023f6357 Mon Sep 17 00:00:00 2001
From: Luke Yeager <luke.yeager@gmail.com>
Date: Thu, 3 Dec 2015 13:28:36 -0800
Subject: [PATCH 6/8] Mark v0.14.0-rc.3

---
 CMakeLists.txt | 2 +-
 Makefile       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5db96196761..f932ef909fe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@ cmake_minimum_required(VERSION 2.8.7)
 project(Caffe C CXX)
 
 # ---[ Caffe version
-set(CAFFE_TARGET_VERSION "0.14.0-rc.2")
+set(CAFFE_TARGET_VERSION "0.14.0-rc.3")
 set(CAFFE_TARGET_SOVERSION "0.14")
 add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION})
 
diff --git a/Makefile b/Makefile
index 212488110d7..e2538d577a7 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,7 @@ LIB_BUILD_DIR := $(BUILD_DIR)/lib
 STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a
 DYNAMIC_VERSION_MAJOR 		:= 0
 DYNAMIC_VERSION_MINOR 		:= 14
-DYNAMIC_VERSION_REVISION 	:= 0-rc.2
+DYNAMIC_VERSION_REVISION 	:= 0-rc.3
 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so
 DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR)
 DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_SONAME_SHORT).$(DYNAMIC_VERSION_REVISION)

From 8a2ab8e098dc87ad0dd7fb2b76bf83bc3fdacda5 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Wed, 13 Jan 2016 14:18:13 -0800
Subject: [PATCH 7/8] gpu_memory cleanup, fixes possible init issue

---
 3rdparty/cub/cub/util_allocator.cuh | 10 ---
 Makefile                            |  2 +-
 include/caffe/util/gpu_memory.hpp   | 11 +++-
 src/caffe/util/gpu_memory.cpp       | 95 ++++++++++++++++++-----------
 4 files changed, 68 insertions(+), 50 deletions(-)

diff --git a/3rdparty/cub/cub/util_allocator.cuh b/3rdparty/cub/cub/util_allocator.cuh
index ff19cad55ce..42b020c9b1c 100644
--- a/3rdparty/cub/cub/util_allocator.cuh
+++ b/3rdparty/cub/cub/util_allocator.cuh
@@ -454,16 +454,6 @@ struct CachingDeviceAllocator
 	    if (error != cudaSuccess) {
 	      if (debug) CubLog("\tdevice %d failed to allocate %lld bytes for stream %lld",
 				device, (long long) search_key.bytes, (long long) search_key.associated_stream);
-
-	      // if (search_key.bytes < cached_bytes[device]) {
-	      // free all cached memory (for all devices), synchrionize and retry once
-	      cudaDeviceSynchronize();
-	      cudaThreadSynchronize();
-	      FreeAllCached();
-	      cudaDeviceSynchronize();
-	      cudaThreadSynchronize();
-	      error = cudaMalloc(&search_key.d_ptr, search_key.bytes);
-	      //	      }
 	    }
 	    if (CubDebug(error))
 	      return error;
diff --git a/Makefile b/Makefile
index e2538d577a7..65703758574 100644
--- a/Makefile
+++ b/Makefile
@@ -571,7 +571,7 @@ $(DYNAMIC_NAME): $(OBJS)| $(LIB_BUILD_DIR)
 	@ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_SONAME_SHORT); ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_SONAME_SHORT)
 	@ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT);   ln -s $(DYNAMIC_SONAME_SHORT) $(DYNAMIC_NAME_SHORT)
 
-$(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR) 
+$(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR)
 	@ echo AR -o $@
 	$(Q)ar rcs $@ $(OBJS)
 
diff --git a/include/caffe/util/gpu_memory.hpp b/include/caffe/util/gpu_memory.hpp
index c57d98a78be..4fdf2a465a8 100644
--- a/include/caffe/util/gpu_memory.hpp
+++ b/include/caffe/util/gpu_memory.hpp
@@ -77,6 +77,8 @@ class gpu_memory {
     cudaStream_t  stream_;
     size_t        size_;
   };
+  static void update_dev_info(int device);
+
 # endif
 
  private:
@@ -89,8 +91,12 @@ class gpu_memory {
 
 #ifndef CPU_ONLY
   struct MemInfo {
-      size_t free;
-      size_t total;
+    MemInfo()  {
+      free = total = flush_count = 0;
+    }
+    size_t   free;
+    size_t   total;
+    unsigned flush_count;
   };
 
   static vector<MemInfo>  dev_info_;
@@ -102,7 +108,6 @@ class gpu_memory {
                        cudaStream_t stream = cudaStreamDefault);
   static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault);
 
-  static void registerStream(cudaStream_t stream);
   static void getInfo(size_t *free_mem, size_t *used_mem);
 
  private:
diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp
index 5e35ef43059..70c8a31c5ed 100644
--- a/src/caffe/util/gpu_memory.cpp
+++ b/src/caffe/util/gpu_memory.cpp
@@ -67,7 +67,30 @@ namespace caffe {
     CHECK((ptr) != NULL);
     switch (mode_) {
     case CubPool:
-      CUDA_CHECK(cubAlloc->DeviceAllocate(ptr, size, stream));
+      if (cubAlloc->DeviceAllocate(ptr, size, stream) != cudaSuccess)
+        {
+          int cur_device;
+          CUDA_CHECK(cudaGetDevice(&cur_device));
+          // free all cached memory (for all devices), synchrionize
+          cudaDeviceSynchronize();
+          cudaThreadSynchronize();
+          cubAlloc->FreeAllCached();
+          cudaDeviceSynchronize();
+          cudaThreadSynchronize();
+
+          // Refresh per-device saved values.
+          for (int i = 0; i < dev_info_.size(); i++) {
+            // only query devices that were initialized
+            if (dev_info_[i].total) {
+              update_dev_info(i);
+              // record which device caused cache flush
+              if (i == cur_device)
+                dev_info_[i].flush_count++;
+            }
+          }
+          // retry once
+          CUDA_CHECK(cubAlloc->DeviceAllocate(ptr, size, stream));
+        }
       break;
     default:
       CUDA_CHECK(cudaMalloc(ptr, size));
@@ -89,46 +112,43 @@ namespace caffe {
     }
   }
 
-  void gpu_memory::registerStream(cudaStream_t stream) {
-    switch (mode_) {
-    case CubPool:
-    default:
-      break;
-    }
-  }
 
-  void gpu_memory::initMEM(const std::vector<int>& gpus, PoolMode m) {
-    mode_ = m;
+  void gpu_memory::update_dev_info(int device) {
     int initial_device;
-
     CUDA_CHECK(cudaGetDevice(&initial_device));
 
+    if (device+1 > dev_info_.size())
+      dev_info_.resize(device+1);
+
+    CUDA_CHECK(cudaSetDevice(device));
+    cudaDeviceProp props;
+    CUDA_CHECK(cudaGetDeviceProperties(&props, device));
+    CUDA_CHECK(cudaMemGetInfo(&dev_info_[device].free,
+                              &dev_info_[device].total));
+
+    if (debug_) {
+      std::cout << "cudaGetDeviceProperties: Mem = "
+                << props.totalGlobalMem <<std::endl;
+      std::cout << "cudaMemGetInfo_[" << device
+                <<": Free= " << dev_info_[device].free
+                << " Total= " << dev_info_[device].total << std::endl;
+    }
 
-    for (int i = 0; i < gpus.size(); i++) {
-      int cur_device = gpus[i];
-      if (cur_device+1 > dev_info_.size())
-        dev_info_.resize(cur_device+1);
-
-      CUDA_CHECK(cudaSetDevice(gpus[i]));
-      cudaDeviceProp props;
-      CUDA_CHECK(cudaGetDeviceProperties(&props, cur_device));
-      CUDA_CHECK(cudaMemGetInfo(&dev_info_[cur_device].free,
-                                &dev_info_[cur_device].total));
+    // make sure we don't have more that total device memory
+    dev_info_[device].total = std::min(props.totalGlobalMem,
+                                           dev_info_[device].total);
 
-      if (debug_) {
-        std::cout << "cudaGetDeviceProperties: Mem = "
-                  << props.totalGlobalMem <<std::endl;
-        std::cout << "cudaMemGetInfo_[" << cur_device
-                  <<": Free= " << dev_info_[cur_device].free
-                  << " Total= " << dev_info_[cur_device].total << std::endl;
-      }
+    // here we are adding existing 'busy' allocations to CUDA free memory
+    dev_info_[device].free =
+      std::min(dev_info_[device].total,
+               dev_info_[device].free
+               + cubAlloc->cached_bytes[device].busy);
+    CUDA_CHECK(cudaSetDevice(initial_device));
+  }
 
-      // make sure we don't ask for more that total device memory
-      dev_info_[i].free = std::min(dev_info_[cur_device].total,
-                                   dev_info_[cur_device].free);
-      dev_info_[i].free = std::min(props.totalGlobalMem,
-                                   dev_info_[cur_device].free);
-    }
+  void gpu_memory::initMEM(const std::vector<int>& gpus, PoolMode m) {
+    mode_ = m;
+    int initial_device;
 
     switch ( mode_ ) {
       case CubPool:
@@ -145,12 +165,15 @@ namespace caffe {
         }
         catch (...) {}
         CHECK(cubAlloc);
+
+        for (int i = 0; i < gpus.size(); i++) {
+          update_dev_info(gpus[i]);
+        }
+
         break;
       default:
         break;
       }
-
-    CUDA_CHECK(cudaSetDevice(initial_device));
   }
 
   const char* gpu_memory::getPoolName()  {

From e1fe5491a58a28d2bb8f2f68940f127b1fbb3133 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Wed, 13 Jan 2016 21:54:19 -0800
Subject: [PATCH 8/8] Fixing Lint errors

---
 src/caffe/util/gpu_memory.cpp | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp
index 70c8a31c5ed..da990b88af0 100644
--- a/src/caffe/util/gpu_memory.cpp
+++ b/src/caffe/util/gpu_memory.cpp
@@ -29,8 +29,7 @@ namespace caffe {
 
   void gpu_memory::init(const std::vector<int>& gpus,
                         PoolMode m, bool debug) {
-
-    bool debug_env = (getenv("DEBUG_GPU_MEM")!=0);
+    bool debug_env = (getenv("DEBUG_GPU_MEM") != 0);
     debug_ = debug || debug_env;
 
     if (gpus.size() <= 0) {
@@ -67,8 +66,7 @@ namespace caffe {
     CHECK((ptr) != NULL);
     switch (mode_) {
     case CubPool:
-      if (cubAlloc->DeviceAllocate(ptr, size, stream) != cudaSuccess)
-        {
+      if (cubAlloc->DeviceAllocate(ptr, size, stream) != cudaSuccess) {
           int cur_device;
           CUDA_CHECK(cudaGetDevice(&cur_device));
           // free all cached memory (for all devices), synchrionize
@@ -148,8 +146,6 @@ namespace caffe {
 
   void gpu_memory::initMEM(const std::vector<int>& gpus, PoolMode m) {
     mode_ = m;
-    int initial_device;
-
     switch ( mode_ ) {
       case CubPool:
         try {
@@ -165,11 +161,9 @@ namespace caffe {
         }
         catch (...) {}
         CHECK(cubAlloc);
-
         for (int i = 0; i < gpus.size(); i++) {
           update_dev_info(gpus[i]);
         }
-
         break;
       default:
         break;