diff --git a/3rdparty/cub/cub/util_allocator.cuh b/3rdparty/cub/cub/util_allocator.cuh index 8de32cdbc47..42b020c9b1c 100644 --- a/3rdparty/cub/cub/util_allocator.cuh +++ b/3rdparty/cub/cub/util_allocator.cuh @@ -408,10 +408,10 @@ struct CachingDeviceAllocator && (block_itr->device == device) && (block_itr->bin == search_key.bin)) { - // use special rule for the last ("exact size") bin: set max memory overuse to 1/8th + // use special rule for the last ("exact size") bin: set max memory overuse to 1/8th if (search_key.bin == (unsigned int) -1 && (block_itr->bytes - search_key.bytes)*8UL > search_key.bytes) break; - + cudaStream_t prev_stream = block_itr->associated_stream; if ((active_stream == prev_stream) || (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) { @@ -454,22 +454,12 @@ struct CachingDeviceAllocator if (error != cudaSuccess) { if (debug) CubLog("\tdevice %d failed to allocate %lld bytes for stream %lld", device, (long long) search_key.bytes, (long long) search_key.associated_stream); - - // if (search_key.bytes < cached_bytes[device]) { - // free all cached memory (for all devices), synchrionize and retry once - cudaDeviceSynchronize(); - cudaThreadSynchronize(); - FreeAllCached(); - cudaDeviceSynchronize(); - cudaThreadSynchronize(); - error = cudaMalloc(&search_key.d_ptr, search_key.bytes); - // } } - if (CubDebug(error)) + if (CubDebug(error)) return error; - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) return error; - + // Insert into live blocks Lock(&spin_lock); live_blocks.insert(search_key); @@ -548,30 +538,30 @@ struct CachingDeviceAllocator // Lock Lock(&spin_lock); - + // Find corresponding block descriptor BusyBlocks::iterator block_itr = live_blocks.find(search_key); if (block_itr != live_blocks.end()) { // Remove from live blocks search_key = *block_itr; - 
live_blocks.erase(block_itr); + live_blocks.erase(block_itr); cached_bytes[device].busy -= search_key.bytes; - + // Check if we should keep the returned allocation if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes) - { + { // Insert returned allocation into free blocks cached_blocks.insert(search_key); cached_bytes[device].free += search_key.bytes; recached = true; - if (debug) { + if (debug) { CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", - device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy); } } } - + Unlock(&spin_lock); if (recached) { @@ -585,7 +575,7 @@ struct CachingDeviceAllocator return error; if (CubDebug(error = cudaSetDevice(device))) return error; } - + // Actually free device memory if (CubDebug(error = cudaFree(d_ptr))) return error; if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; diff --git a/Makefile b/Makefile index e2538d577a7..65703758574 100644 --- a/Makefile +++ b/Makefile @@ -571,7 +571,7 @@ $(DYNAMIC_NAME): $(OBJS)| $(LIB_BUILD_DIR) @ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_SONAME_SHORT); ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_SONAME_SHORT) @ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT); ln -s $(DYNAMIC_SONAME_SHORT) $(DYNAMIC_NAME_SHORT) -$(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR) +$(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR) @ echo AR -o $@ $(Q)ar rcs $@ $(OBJS) diff --git a/include/caffe/util/gpu_memory.hpp b/include/caffe/util/gpu_memory.hpp index c57d98a78be..4fdf2a465a8 100644 --- a/include/caffe/util/gpu_memory.hpp +++ b/include/caffe/util/gpu_memory.hpp @@ -77,6 +77,8 @@ class 
gpu_memory { cudaStream_t stream_; size_t size_; }; + static void update_dev_info(int device); + # endif private: @@ -89,8 +91,12 @@ class gpu_memory { #ifndef CPU_ONLY struct MemInfo { - size_t free; - size_t total; + MemInfo() { + free = total = flush_count = 0; + } + size_t free; + size_t total; + unsigned flush_count; }; static vector dev_info_; @@ -102,7 +108,6 @@ class gpu_memory { cudaStream_t stream = cudaStreamDefault); static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault); - static void registerStream(cudaStream_t stream); static void getInfo(size_t *free_mem, size_t *used_mem); private: diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp index 8db4178530e..da990b88af0 100644 --- a/src/caffe/util/gpu_memory.cpp +++ b/src/caffe/util/gpu_memory.cpp @@ -26,9 +26,12 @@ namespace caffe { return "No GPU: CPU Only Memory"; } #else + void gpu_memory::init(const std::vector& gpus, PoolMode m, bool debug) { - debug_ = debug; + bool debug_env = (getenv("DEBUG_GPU_MEM") != 0); + debug_ = debug || debug_env; + if (gpus.size() <= 0) { // should we report an error here ? m = gpu_memory::NoPool; @@ -63,7 +66,29 @@ namespace caffe { CHECK((ptr) != NULL); switch (mode_) { case CubPool: - CUDA_CHECK(cubAlloc->DeviceAllocate(ptr, size, stream)); + if (cubAlloc->DeviceAllocate(ptr, size, stream) != cudaSuccess) { + int cur_device; + CUDA_CHECK(cudaGetDevice(&cur_device)); + // free all cached memory (for all devices), synchronize + cudaDeviceSynchronize(); + cudaThreadSynchronize(); + cubAlloc->FreeAllCached(); + cudaDeviceSynchronize(); + cudaThreadSynchronize(); + + // Refresh per-device saved values. 
+ for (int i = 0; i < dev_info_.size(); i++) { + // only query devices that were initialized + if (dev_info_[i].total) { + update_dev_info(i); + // record which device caused cache flush + if (i == cur_device) + dev_info_[i].flush_count++; + } + } + // retry once + CUDA_CHECK(cubAlloc->DeviceAllocate(ptr, size, stream)); + } break; default: CUDA_CHECK(cudaMalloc(ptr, size)); @@ -85,47 +110,42 @@ namespace caffe { } } - void gpu_memory::registerStream(cudaStream_t stream) { - switch (mode_) { - case CubPool: - default: - break; - } - } - void gpu_memory::initMEM(const std::vector& gpus, PoolMode m) { - mode_ = m; + void gpu_memory::update_dev_info(int device) { int initial_device; - CUDA_CHECK(cudaGetDevice(&initial_device)); - for (int i = 0; i < gpus.size(); i++) { - int cur_device = gpus[i]; - if (cur_device+1 > dev_info_.size()) - dev_info_.resize(cur_device+1); - - CUDA_CHECK(cudaSetDevice(gpus[i])); - cudaDeviceProp props; - CUDA_CHECK(cudaGetDeviceProperties(&props, cur_device)); - CUDA_CHECK(cudaMemGetInfo(&dev_info_[cur_device].free, - &dev_info_[cur_device].total)); - - if (debug_) { - std::cout << "cudaGetDeviceProperties: Mem = " - << props.totalGlobalMem <