From b52ab92e59e9efea5e8b58548f70558c1b26e41b Mon Sep 17 00:00:00 2001
From: thatguymike <mike.houston@gmail.com>
Date: Wed, 2 Dec 2015 20:27:18 -0800
Subject: [PATCH] Cross merge fixes from Boris

---
 3rdparty/cub/cub/util_allocator.cuh |  7 ++++-
 include/caffe/util/gpu_memory.hpp   | 14 ++++++---
 src/caffe/util/gpu_memory.cpp       | 49 +++++++++++++++--------------
 tools/caffe.cpp                     |  2 +-
 4 files changed, 42 insertions(+), 30 deletions(-)
diff --git a/3rdparty/cub/cub/util_allocator.cuh b/3rdparty/cub/cub/util_allocator.cuh
index 7f6bebe7f74..8de32cdbc47 100644
--- a/3rdparty/cub/cub/util_allocator.cuh
+++ b/3rdparty/cub/cub/util_allocator.cuh
@@ -407,6 +407,11 @@ struct CachingDeviceAllocator
         while ( (block_itr != cached_blocks.end())
                 && (block_itr->device == device)
                 && (block_itr->bin == search_key.bin)) {
+
+                // use special rule for the last ("exact size") bin: set max memory overuse to 1/8th
+          if (search_key.bin == (unsigned int) -1 && (block_itr->bytes - search_key.bytes)*8UL > search_key.bytes)
+            break;
+            
           cudaStream_t prev_stream = block_itr->associated_stream;
 	  if ((active_stream == prev_stream)
 	      || (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
@@ -662,7 +667,7 @@ struct CachingDeviceAllocator
             cached_blocks.erase(begin);
 
             if (debug) CubLog("\tdevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-                              current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].free);
+                              current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].busy);
         }
 
         Unlock(&spin_lock);
diff --git a/include/caffe/util/gpu_memory.hpp b/include/caffe/util/gpu_memory.hpp
index b1a6c919cf7..c57d98a78be 100644
--- a/include/caffe/util/gpu_memory.hpp
+++ b/include/caffe/util/gpu_memory.hpp
@@ -83,11 +83,17 @@ class gpu_memory {
   static void init(const std::vector<int>&, PoolMode, bool);
   static void destroy();
 
-  static bool     initialized_;
-  static PoolMode mode_;
-  static size_t   poolsize_;
-  static bool     debug_;
+  static bool             initialized_;
+  static PoolMode         mode_;
+  static bool             debug_;
+
 #ifndef CPU_ONLY
+  struct MemInfo {
+      size_t free;
+      size_t total;
+  };
+
+  static vector<MemInfo>  dev_info_;
 
  public:
   typedef void* pointer;
diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp
index 65e9b1cc19c..8db4178530e 100644
--- a/src/caffe/util/gpu_memory.cpp
+++ b/src/caffe/util/gpu_memory.cpp
@@ -12,10 +12,10 @@ namespace caffe {
 
 #ifndef CPU_ONLY  // CPU-only Caffe.
   static cub::CachingDeviceAllocator* cubAlloc = 0;
+  vector<gpu_memory::MemInfo> gpu_memory::dev_info_;
 #endif
 
   gpu_memory::PoolMode gpu_memory::mode_   = gpu_memory::NoPool;
-  size_t               gpu_memory::poolsize_ = 0;
   bool                 gpu_memory::debug_ = false;
 
 #ifdef CPU_ONLY  // CPU-only Caffe.
@@ -43,8 +43,7 @@ namespace caffe {
     }
     if (debug)
       std::cout << "gpu_memory initialized with "
-                << getPoolName() << ". Poolsize = "
-                << (1.0*poolsize_)/(1024.0*1024.0*1024.0) << " G." << std::endl;
+                << getPoolName()  << std::endl;
   }
 
   void gpu_memory::destroy() {
@@ -101,42 +100,42 @@ namespace caffe {
     CUDA_CHECK(cudaGetDevice(&initial_device));
 
     for (int i = 0; i < gpus.size(); i++) {
+        int cur_device = gpus[i];
+        if (cur_device+1 > dev_info_.size())
+            dev_info_.resize(cur_device+1);
+
       CUDA_CHECK(cudaSetDevice(gpus[i]));
-      size_t free_mem, total_mem;
       cudaDeviceProp props;
-      CUDA_CHECK(cudaGetDeviceProperties(&props, gpus[i]));
-      CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem));
+      CUDA_CHECK(cudaGetDeviceProperties(&props, cur_device));
+      CUDA_CHECK(cudaMemGetInfo(&dev_info_[cur_device].free,
+                                &dev_info_[cur_device].total));
 
       if (debug_) {
         std::cout << "cudaGetDeviceProperties: Mem = "
-                  << props.totalGlobalMem <<std:: endl;
-        std::cout << "cudaMemGetInfo: Free= " << free_mem
-                  << " Total= " << total_mem << std::endl;
+                  << props.totalGlobalMem <<std::endl;
+        std::cout << "cudaMemGetInfo_[" << cur_device
+                  << "]: Free= " << dev_info_[cur_device].free
+                  << " Total= " << dev_info_[cur_device].total << std::endl;
       }
 
       // make sure we don't ask for more that total device memory
-      free_mem = std::min(total_mem, free_mem);
-      free_mem = size_t(0.95*std::min(props.totalGlobalMem, free_mem));
-      // find out the smallest GPU size
-      if (poolsize_ == 0 || poolsize_ > free_mem)
-        poolsize_ = free_mem;
+      // Clamp this device's free memory; index by device id, not by i.
+      dev_info_[cur_device].free =
+          std::min(std::min(props.totalGlobalMem, dev_info_[cur_device].total),
+                   dev_info_[cur_device].free);
     }
 
 
     switch ( mode_ ) {
       case CubPool:
         try {
-          // if you are paranoid, that doesn't mean they are not after you :)
+          // just in case someone installed 'no cleanup' arena before
           delete cubAlloc;
 
-          cubAlloc = new cub::CachingDeviceAllocator( 2,   // defaults
+          cubAlloc = new cub::CachingDeviceAllocator( 2,
                                                       6,
-                                                      32,  // largest
-                                                           // cached
-                                                           // allocation
-                                                           // becomes
-                                                           // 2^32 here
-                                                      poolsize_,
+                                                      16,
+                                                      (size_t)-1,
                                                       false,
                                                       debug_);
         }
@@ -164,13 +163,15 @@ namespace caffe {
     case CubPool:
       int cur_device;
       CUDA_CHECK(cudaGetDevice(&cur_device));
-      *total_mem = poolsize_;
+      *total_mem = dev_info_[cur_device].total;
       // Free memory is initial free memory minus outstanding allocations.
       // Assuming we only allocate via gpu_memory since its constructon.
-      *free_mem = poolsize_ - cubAlloc->cached_bytes[cur_device].busy;
+      *free_mem = dev_info_[cur_device].free -
+          cubAlloc->cached_bytes[cur_device].busy;
       break;
     default:
       CUDA_CHECK(cudaMemGetInfo(free_mem, total_mem));
+      break;
     }
   }
 #endif  // CPU_ONLY
diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index 306a5a55d34..abba022d998 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -407,7 +407,7 @@ int main(int argc, char** argv) {
   // initialize gpu memory arena
   vector<int> gpus;
   get_gpus(&gpus);
-  caffe::gpu_memory::arena arena(gpus);
+  caffe::gpu_memory::arena arena(gpus, caffe::gpu_memory::DefaultPool, false);
 
   if (argc == 2) {
 #ifdef WITH_PYTHON_LAYER