From 4e936f7c8dbae5b6e33c9aad8e162e3edb02702d Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Tue, 1 Dec 2015 21:48:27 -0800 Subject: [PATCH 1/4] Fixed large allocations cache --- 3rdparty/cub/cub/util_allocator.cuh | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/3rdparty/cub/cub/util_allocator.cuh b/3rdparty/cub/cub/util_allocator.cuh index 7f6bebe7f74..e0e0e4562eb 100644 --- a/3rdparty/cub/cub/util_allocator.cuh +++ b/3rdparty/cub/cub/util_allocator.cuh @@ -407,6 +407,11 @@ struct CachingDeviceAllocator while ( (block_itr != cached_blocks.end()) && (block_itr->device == device) && (block_itr->bin == search_key.bin)) { + + // use special rule for the last ("exact size") bin: set max memory overuse to 1/8th + if (search_key.bin == (unsigned int) -1 && (block_itr->bytes - search_key.bytes)*8UL > search_key.bytes) + break; + cudaStream_t prev_stream = block_itr->associated_stream; if ((active_stream == prev_stream) || (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) { @@ -460,11 +465,11 @@ struct CachingDeviceAllocator error = cudaMalloc(&search_key.d_ptr, search_key.bytes); // } } - if (CubDebug(error)) + if (CubDebug(error)) return error; - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) return error; - + // Insert into live blocks Lock(&spin_lock); live_blocks.insert(search_key); @@ -543,30 +548,30 @@ struct CachingDeviceAllocator // Lock Lock(&spin_lock); - + // Find corresponding block descriptor BusyBlocks::iterator block_itr = live_blocks.find(search_key); if (block_itr != live_blocks.end()) { // Remove from live blocks search_key = *block_itr; - live_blocks.erase(block_itr); + live_blocks.erase(block_itr); cached_bytes[device].busy -= search_key.bytes; - + // Check if we should keep the returned allocation if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes) - { + { // Insert returned allocation into free blocks cached_blocks.insert(search_key); cached_bytes[device].free += search_key.bytes; recached = true; - if (debug) { + if (debug) { CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", - device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy); } } } - + Unlock(&spin_lock); if (recached) { @@ -580,7 +585,7 @@ struct CachingDeviceAllocator return error; if (CubDebug(error = cudaSetDevice(device))) return error; } - + // Actually free device memory if (CubDebug(error = cudaFree(d_ptr))) return error; if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; From b61ebf0eec6ab8c423d2cc00973886d14b776322 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Tue, 1 Dec 2015 23:42:00 -0800 Subject: [PATCH 2/4] Fixing GPU arena setup --- include/caffe/util/gpu_memory.hpp | 14 +++++++--- src/caffe/util/gpu_memory.cpp | 46 +++++++++++++++++-------------- 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/include/caffe/util/gpu_memory.hpp b/include/caffe/util/gpu_memory.hpp index b1a6c919cf7..c57d98a78be 100644 --- a/include/caffe/util/gpu_memory.hpp +++ b/include/caffe/util/gpu_memory.hpp @@ -83,11 +83,17 @@ class gpu_memory { static void init(const std::vector&, PoolMode, bool); static void destroy(); - static bool initialized_; - static PoolMode mode_; - static size_t poolsize_; - static bool debug_; + static bool initialized_; + static PoolMode mode_; + static bool debug_; + #ifndef CPU_ONLY + struct MemInfo { + size_t free; + size_t total; + }; + + static vector dev_info_; public: typedef void* pointer; diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp index 5dd312153cc..30e404e5f1b 100644 --- a/src/caffe/util/gpu_memory.cpp +++ b/src/caffe/util/gpu_memory.cpp @@ -12,10 +12,10 @@ namespace caffe { #ifndef CPU_ONLY // CPU-only Caffe. static cub::CachingDeviceAllocator* cubAlloc = 0; + vector gpu_memory::dev_info_; #endif gpu_memory::PoolMode gpu_memory::mode_ = gpu_memory::NoPool; - size_t gpu_memory::poolsize_ = 0; bool gpu_memory::debug_ = false; #ifdef CPU_ONLY // CPU-only Caffe. @@ -43,8 +43,7 @@ namespace caffe { } if (debug) std::cout << "gpu_memory initialized with " - << getPoolName() << ". Poolsize = " - << (1.0*poolsize_)/(1024.0*1024.0*1024.0) << " G." << std::endl; + << getPoolName() << std::endl; } void gpu_memory::destroy() { @@ -100,39 +99,44 @@ namespace caffe { CUDA_CHECK(cudaGetDevice(&initial_device)); + for (int i = 0; i < gpus.size(); i++) { + int cur_device = gpus[i]; + if (cur_device+1 > dev_info_.size()) + dev_info_.resize(cur_device+1); + CUDA_CHECK(cudaSetDevice(gpus[i])); - size_t free_mem, total_mem; cudaDeviceProp props; - CUDA_CHECK(cudaGetDeviceProperties(&props, gpus[i])); - CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem)); + CUDA_CHECK(cudaGetDeviceProperties(&props, cur_device)); + CUDA_CHECK(cudaMemGetInfo(&dev_info_[cur_device].free, + &dev_info_[cur_device].total)); if (debug_) { std::cout << "cudaGetDeviceProperties: Mem = " - << props.totalGlobalMem <bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].free); + current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].busy); } Unlock(&spin_lock); From 1c678452b9034aa891850f749e00d6df6cb044b8 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 2 Dec 2015 15:23:03 -0800 Subject: [PATCH 4/4] d --- tools/caffe.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/caffe.cpp b/tools/caffe.cpp index f6dbb89ec2d..306a5a55d34 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -407,7 +407,7 @@ int main(int argc, char** argv) { // initialize gpu memory arena vector gpus; get_gpus(&gpus); - caffe::gpu_memory::arena arena(gpus, caffe::gpu_memory::DefaultPool, true); + caffe::gpu_memory::arena arena(gpus); if (argc == 2) { #ifdef WITH_PYTHON_LAYER