diff --git a/3rdparty/cub/cub/util_allocator.cuh b/3rdparty/cub/cub/util_allocator.cuh index 8de32cdbc47..42b020c9b1c 100644 --- a/3rdparty/cub/cub/util_allocator.cuh +++ b/3rdparty/cub/cub/util_allocator.cuh @@ -408,10 +408,10 @@ struct CachingDeviceAllocator && (block_itr->device == device) && (block_itr->bin == search_key.bin)) { - // use special rule for the last ("exact size") bin: set max memory overuse to 1/8th + // use special rule for the last ("exact size") bin: set max memory overuse to 1/8th if (search_key.bin == (unsigned int) -1 && (block_itr->bytes - search_key.bytes)*8UL > search_key.bytes) break; - + cudaStream_t prev_stream = block_itr->associated_stream; if ((active_stream == prev_stream) || (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) { @@ -454,22 +454,12 @@ struct CachingDeviceAllocator if (error != cudaSuccess) { if (debug) CubLog("\tdevice %d failed to allocate %lld bytes for stream %lld", device, (long long) search_key.bytes, (long long) search_key.associated_stream); - - // if (search_key.bytes < cached_bytes[device]) { - // free all cached memory (for all devices), synchrionize and retry once - cudaDeviceSynchronize(); - cudaThreadSynchronize(); - FreeAllCached(); - cudaDeviceSynchronize(); - cudaThreadSynchronize(); - error = cudaMalloc(&search_key.d_ptr, search_key.bytes); - // } } - if (CubDebug(error)) + if (CubDebug(error)) return error; - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) return error; - + // Insert into live blocks Lock(&spin_lock); live_blocks.insert(search_key); @@ -548,30 +538,30 @@ struct CachingDeviceAllocator // Lock Lock(&spin_lock); - + // Find corresponding block descriptor BusyBlocks::iterator block_itr = live_blocks.find(search_key); if (block_itr != live_blocks.end()) { // Remove from live blocks search_key = *block_itr; - 
live_blocks.erase(block_itr); + live_blocks.erase(block_itr); cached_bytes[device].busy -= search_key.bytes; - + // Check if we should keep the returned allocation if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes) - { + { // Insert returned allocation into free blocks cached_blocks.insert(search_key); cached_bytes[device].free += search_key.bytes; recached = true; - if (debug) { + if (debug) { CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", - device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy); } } } - + Unlock(&spin_lock); if (recached) { @@ -585,7 +575,7 @@ struct CachingDeviceAllocator return error; if (CubDebug(error = cudaSetDevice(device))) return error; } - + // Actually free device memory if (CubDebug(error = cudaFree(d_ptr))) return error; if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; diff --git a/Makefile b/Makefile index e2538d577a7..65703758574 100644 --- a/Makefile +++ b/Makefile @@ -571,7 +571,7 @@ $(DYNAMIC_NAME): $(OBJS)| $(LIB_BUILD_DIR) @ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_SONAME_SHORT); ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_SONAME_SHORT) @ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT); ln -s $(DYNAMIC_SONAME_SHORT) $(DYNAMIC_NAME_SHORT) -$(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR) +$(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR) @ echo AR -o $@ $(Q)ar rcs $@ $(OBJS) diff --git a/include/caffe/util/gpu_memory.hpp b/include/caffe/util/gpu_memory.hpp index c57d98a78be..4fdf2a465a8 100644 --- a/include/caffe/util/gpu_memory.hpp +++ b/include/caffe/util/gpu_memory.hpp @@ -77,6 +77,8 @@ class 
gpu_memory { cudaStream_t stream_; size_t size_; }; + static void update_dev_info(int device); + # endif private: @@ -89,8 +91,12 @@ class gpu_memory { #ifndef CPU_ONLY struct MemInfo { - size_t free; - size_t total; + MemInfo() { + free = total = flush_count = 0; + } + size_t free; + size_t total; + unsigned flush_count; }; static vector dev_info_; @@ -102,7 +108,6 @@ class gpu_memory { cudaStream_t stream = cudaStreamDefault); static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault); - static void registerStream(cudaStream_t stream); static void getInfo(size_t *free_mem, size_t *used_mem); private: diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp index 8db4178530e..da990b88af0 100644 --- a/src/caffe/util/gpu_memory.cpp +++ b/src/caffe/util/gpu_memory.cpp @@ -26,9 +26,12 @@ namespace caffe { return "No GPU: CPU Only Memory"; } #else + void gpu_memory::init(const std::vector& gpus, PoolMode m, bool debug) { - debug_ = debug; + bool debug_env = (getenv("DEBUG_GPU_MEM") != 0); + debug_ = debug || debug_env; + if (gpus.size() <= 0) { // should we report an error here ? m = gpu_memory::NoPool; @@ -63,7 +66,29 @@ namespace caffe { CHECK((ptr) != NULL); switch (mode_) { case CubPool: - CUDA_CHECK(cubAlloc->DeviceAllocate(ptr, size, stream)); + if (cubAlloc->DeviceAllocate(ptr, size, stream) != cudaSuccess) { + int cur_device; + CUDA_CHECK(cudaGetDevice(&cur_device)); + // free all cached memory (for all devices), synchronize + cudaDeviceSynchronize(); + cudaThreadSynchronize(); + cubAlloc->FreeAllCached(); + cudaDeviceSynchronize(); + cudaThreadSynchronize(); + + // Refresh per-device saved values. 
+ for (int i = 0; i < dev_info_.size(); i++) { + // only query devices that were initialized + if (dev_info_[i].total) { + update_dev_info(i); + // record which device caused cache flush + if (i == cur_device) + dev_info_[i].flush_count++; + } + } + // retry once + CUDA_CHECK(cubAlloc->DeviceAllocate(ptr, size, stream)); + } break; default: CUDA_CHECK(cudaMalloc(ptr, size)); @@ -85,47 +110,42 @@ namespace caffe { } } - void gpu_memory::registerStream(cudaStream_t stream) { - switch (mode_) { - case CubPool: - default: - break; - } - } - void gpu_memory::initMEM(const std::vector& gpus, PoolMode m) { - mode_ = m; + void gpu_memory::update_dev_info(int device) { int initial_device; - CUDA_CHECK(cudaGetDevice(&initial_device)); - for (int i = 0; i < gpus.size(); i++) { - int cur_device = gpus[i]; - if (cur_device+1 > dev_info_.size()) - dev_info_.resize(cur_device+1); - - CUDA_CHECK(cudaSetDevice(gpus[i])); - cudaDeviceProp props; - CUDA_CHECK(cudaGetDeviceProperties(&props, cur_device)); - CUDA_CHECK(cudaMemGetInfo(&dev_info_[cur_device].free, - &dev_info_[cur_device].total)); - - if (debug_) { - std::cout << "cudaGetDeviceProperties: Mem = " - << props.totalGlobalMem <