From 4e936f7c8dbae5b6e33c9aad8e162e3edb02702d Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Tue, 1 Dec 2015 21:48:27 -0800
Subject: [PATCH 1/4] Fixed large allocations cache

---
 3rdparty/cub/cub/util_allocator.cuh | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/3rdparty/cub/cub/util_allocator.cuh b/3rdparty/cub/cub/util_allocator.cuh
index 7f6bebe7f74..e0e0e4562eb 100644
--- a/3rdparty/cub/cub/util_allocator.cuh
+++ b/3rdparty/cub/cub/util_allocator.cuh
@@ -407,6 +407,11 @@ struct CachingDeviceAllocator
         while ( (block_itr != cached_blocks.end())
                 && (block_itr->device == device)
                 && (block_itr->bin == search_key.bin)) {
+
+          // use special rule for the last ("exact size") bin: set max memory overuse to 1/8th
+          if (search_key.bin == (unsigned int) -1 && (block_itr->bytes - search_key.bytes)*8UL > search_key.bytes)
+            break;
+
           cudaStream_t prev_stream = block_itr->associated_stream;
 	  if ((active_stream == prev_stream)
 	      || (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
@@ -460,11 +465,11 @@ struct CachingDeviceAllocator
 	      error = cudaMalloc(&search_key.d_ptr, search_key.bytes);
 	      //	      }
 	    }
-	    if (CubDebug(error)) 
+	    if (CubDebug(error))
 	      return error;
-	    if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) 
+	    if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
 	      return error;
-	    
+
 	    // Insert into live blocks
 	    Lock(&spin_lock);
 	    live_blocks.insert(search_key);
@@ -543,30 +548,30 @@ struct CachingDeviceAllocator
 
 	// Lock
 	Lock(&spin_lock);
-	
+
 	// Find corresponding block descriptor
 	BusyBlocks::iterator block_itr = live_blocks.find(search_key);
 	if (block_itr != live_blocks.end()) {
 	  // Remove from live blocks
 	  search_key = *block_itr;
-	  live_blocks.erase(block_itr);	    
+	  live_blocks.erase(block_itr);
 	  cached_bytes[device].busy -= search_key.bytes;
-	  
+
 	  // Check if we should keep the returned allocation
 	  if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)
-	    {	      
+	    {
 	      // Insert returned allocation into free blocks
 	      cached_blocks.insert(search_key);
 	      cached_bytes[device].free += search_key.bytes;
 	      recached = true;
-	      if (debug) { 
+	      if (debug) {
 		CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
-		       device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), 
+		       device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
 		       (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);
 	      }
 	    }
 	}
-	
+
         Unlock(&spin_lock);
 
         if (recached) {
@@ -580,7 +585,7 @@ struct CachingDeviceAllocator
 	      return error;
 	    if (CubDebug(error = cudaSetDevice(device))) return error;
 	  }
-	  
+
           // Actually free device memory
           if (CubDebug(error = cudaFree(d_ptr))) return error;
           if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;

From b61ebf0eec6ab8c423d2cc00973886d14b776322 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Tue, 1 Dec 2015 23:42:00 -0800
Subject: [PATCH 2/4] Fixing GPU arena setup

---
 include/caffe/util/gpu_memory.hpp | 14 +++++++---
 src/caffe/util/gpu_memory.cpp     | 46 +++++++++++++++++--------------
 2 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/include/caffe/util/gpu_memory.hpp b/include/caffe/util/gpu_memory.hpp
index b1a6c919cf7..c57d98a78be 100644
--- a/include/caffe/util/gpu_memory.hpp
+++ b/include/caffe/util/gpu_memory.hpp
@@ -83,11 +83,17 @@ class gpu_memory {
   static void init(const std::vector<int>&, PoolMode, bool);
   static void destroy();
 
-  static bool     initialized_;
-  static PoolMode mode_;
-  static size_t   poolsize_;
-  static bool     debug_;
+  static bool             initialized_;
+  static PoolMode         mode_;
+  static bool             debug_;
+
 #ifndef CPU_ONLY
+  struct MemInfo {
+      size_t free;
+      size_t total;
+  };
+
+  static vector<MemInfo>  dev_info_;
 
  public:
   typedef void* pointer;
diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp
index 5dd312153cc..30e404e5f1b 100644
--- a/src/caffe/util/gpu_memory.cpp
+++ b/src/caffe/util/gpu_memory.cpp
@@ -12,10 +12,10 @@ namespace caffe {
 
 #ifndef CPU_ONLY  // CPU-only Caffe.
   static cub::CachingDeviceAllocator* cubAlloc = 0;
+  vector<gpu_memory::MemInfo> gpu_memory::dev_info_;
 #endif
 
   gpu_memory::PoolMode gpu_memory::mode_   = gpu_memory::NoPool;
-  size_t               gpu_memory::poolsize_ = 0;
   bool                 gpu_memory::debug_ = false;
 
 #ifdef CPU_ONLY  // CPU-only Caffe.
@@ -43,8 +43,7 @@ namespace caffe {
     }
     if (debug)
       std::cout << "gpu_memory initialized with "
-                << getPoolName() << ". Poolsize = "
-                << (1.0*poolsize_)/(1024.0*1024.0*1024.0) << " G." << std::endl;
+                << getPoolName()  << std::endl;
   }
 
   void gpu_memory::destroy() {
@@ -100,39 +99,44 @@ namespace caffe {
 
     CUDA_CHECK(cudaGetDevice(&initial_device));
 
+
     for (int i = 0; i < gpus.size(); i++) {
+      int cur_device = gpus[i];
+      if (cur_device+1 > dev_info_.size())
+        dev_info_.resize(cur_device+1);
+
       CUDA_CHECK(cudaSetDevice(gpus[i]));
-      size_t free_mem, total_mem;
       cudaDeviceProp props;
-      CUDA_CHECK(cudaGetDeviceProperties(&props, gpus[i]));
-      CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem));
+      CUDA_CHECK(cudaGetDeviceProperties(&props, cur_device));
+      CUDA_CHECK(cudaMemGetInfo(&dev_info_[cur_device].free,
+                                &dev_info_[cur_device].total));
 
       if (debug_) {
         std::cout << "cudaGetDeviceProperties: Mem = "
-                  << props.totalGlobalMem <<std:: endl;
-        std::cout << "cudaMemGetInfo: Free= " << free_mem
-                  << " Total= " << total_mem << std::endl;
+                  << props.totalGlobalMem <<std::endl;
+        std::cout << "cudaMemGetInfo_[" << cur_device
+                  <<": Free= " << dev_info_[cur_device].free
+                  << " Total= " << dev_info_[cur_device].total << std::endl;
       }
 
       // make sure we don't ask for more that total device memory
-      free_mem = std::min(total_mem, free_mem);
-      free_mem = size_t(0.95*std::min(props.totalGlobalMem, free_mem));
-      // find out the smallest GPU size
-      if (poolsize_ == 0 || poolsize_ > free_mem)
-        poolsize_ = free_mem;
-    }
+      dev_info_[i].free = std::min(dev_info_[cur_device].total,
+                                   dev_info_[cur_device].free);
+      dev_info_[i].free = std::min(props.totalGlobalMem,
+                                   dev_info_[cur_device].free);
+     }
 
 
     switch ( mode_ ) {
       case CubPool:
         try {
-          // if you are paranoid, that doesn't mean they are not after you :)
+          // just in case someone installed 'no cleanup' arena before
           delete cubAlloc;
 
-          cubAlloc = new cub::CachingDeviceAllocator( 2,   // defaults
+          cubAlloc = new cub::CachingDeviceAllocator( 2,
                                                       6,
                                                       16,
-                                                      poolsize_,
+                                                      (size_t)-1,
                                                       false,
                                                       debug_);
         }
@@ -160,13 +164,15 @@ namespace caffe {
     case CubPool:
       int cur_device;
       CUDA_CHECK(cudaGetDevice(&cur_device));
-      *total_mem = poolsize_;
+      *total_mem = dev_info_[cur_device].total;
       // Free memory is initial free memory minus outstanding allocations.
       // Assuming we only allocate via gpu_memory since its constructon.
-      *free_mem = poolsize_ - cubAlloc->cached_bytes[cur_device].busy;
+      *free_mem = dev_info_[cur_device].free -
+        cubAlloc->cached_bytes[cur_device].busy;
       break;
     default:
       CUDA_CHECK(cudaMemGetInfo(free_mem, total_mem));
+      break;
     }
   }
 #endif  // CPU_ONLY

From 37d8e766cef37dd07fb3c35a1ad7c0b5a70913a3 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Wed, 2 Dec 2015 12:50:37 -0800
Subject: [PATCH 3/4] Fixed debug print

---
 3rdparty/cub/cub/util_allocator.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/cub/cub/util_allocator.cuh b/3rdparty/cub/cub/util_allocator.cuh
index e0e0e4562eb..ff19cad55ce 100644
--- a/3rdparty/cub/cub/util_allocator.cuh
+++ b/3rdparty/cub/cub/util_allocator.cuh
@@ -667,7 +667,7 @@ struct CachingDeviceAllocator
             cached_blocks.erase(begin);
 
             if (debug) CubLog("\tdevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-                              current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].free);
+                              current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].busy);
         }
 
         Unlock(&spin_lock);

From 1c678452b9034aa891850f749e00d6df6cb044b8 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Wed, 2 Dec 2015 15:23:03 -0800
Subject: [PATCH 4/4] d

---
 tools/caffe.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index f6dbb89ec2d..306a5a55d34 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -407,7 +407,7 @@ int main(int argc, char** argv) {
   // initialize gpu memory arena
   vector<int> gpus;
   get_gpus(&gpus);
-  caffe::gpu_memory::arena arena(gpus, caffe::gpu_memory::DefaultPool, true);
+  caffe::gpu_memory::arena arena(gpus);
 
   if (argc == 2) {
 #ifdef WITH_PYTHON_LAYER