From 4e936f7c8dbae5b6e33c9aad8e162e3edb02702d Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Tue, 1 Dec 2015 21:48:27 -0800 Subject: [PATCH 1/8] Fixed large allocations cache --- 3rdparty/cub/cub/util_allocator.cuh | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/3rdparty/cub/cub/util_allocator.cuh b/3rdparty/cub/cub/util_allocator.cuh index 7f6bebe7f74..e0e0e4562eb 100644 --- a/3rdparty/cub/cub/util_allocator.cuh +++ b/3rdparty/cub/cub/util_allocator.cuh @@ -407,6 +407,11 @@ struct CachingDeviceAllocator while ( (block_itr != cached_blocks.end()) && (block_itr->device == device) && (block_itr->bin == search_key.bin)) { + + // use special rule for the last ("exact size") bin: set max memory overuse to 1/8th + if (search_key.bin == (unsigned int) -1 && (block_itr->bytes - search_key.bytes)*8UL > search_key.bytes) + break; + cudaStream_t prev_stream = block_itr->associated_stream; if ((active_stream == prev_stream) || (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) { @@ -460,11 +465,11 @@ struct CachingDeviceAllocator error = cudaMalloc(&search_key.d_ptr, search_key.bytes); // } } - if (CubDebug(error)) + if (CubDebug(error)) return error; - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) return error; - + // Insert into live blocks Lock(&spin_lock); live_blocks.insert(search_key); @@ -543,30 +548,30 @@ struct CachingDeviceAllocator // Lock Lock(&spin_lock); - + // Find corresponding block descriptor BusyBlocks::iterator block_itr = live_blocks.find(search_key); if (block_itr != live_blocks.end()) { // Remove from live blocks search_key = *block_itr; - live_blocks.erase(block_itr); + live_blocks.erase(block_itr); cached_bytes[device].busy -= search_key.bytes; - + // Check if we should keep the returned allocation if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes) - { + { // Insert returned allocation into free blocks cached_blocks.insert(search_key); cached_bytes[device].free += search_key.bytes; recached = true; - if (debug) { + if (debug) { CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", - device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy); } } } - + Unlock(&spin_lock); if (recached) { @@ -580,7 +585,7 @@ struct CachingDeviceAllocator return error; if (CubDebug(error = cudaSetDevice(device))) return error; } - + // Actually free device memory if (CubDebug(error = cudaFree(d_ptr))) return error; if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; From b61ebf0eec6ab8c423d2cc00973886d14b776322 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Tue, 1 Dec 2015 23:42:00 -0800 Subject: [PATCH 2/8] Fixing GPU arena setup --- include/caffe/util/gpu_memory.hpp | 14 +++++++--- src/caffe/util/gpu_memory.cpp | 46 +++++++++++++++++-------------- 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/include/caffe/util/gpu_memory.hpp b/include/caffe/util/gpu_memory.hpp index b1a6c919cf7..c57d98a78be 100644 --- a/include/caffe/util/gpu_memory.hpp +++ b/include/caffe/util/gpu_memory.hpp @@ -83,11 +83,17 @@ class gpu_memory { static void init(const std::vector&, PoolMode, bool); static void destroy(); - static bool initialized_; - static PoolMode mode_; - static size_t poolsize_; - static bool debug_; + static bool initialized_; + static PoolMode mode_; + static bool debug_; + #ifndef CPU_ONLY + struct MemInfo { + size_t free; + size_t total; + }; + + static vector dev_info_; public: typedef void* pointer; diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp index 5dd312153cc..30e404e5f1b 100644 --- a/src/caffe/util/gpu_memory.cpp +++ b/src/caffe/util/gpu_memory.cpp @@ -12,10 +12,10 @@ namespace caffe { #ifndef CPU_ONLY // CPU-only Caffe. static cub::CachingDeviceAllocator* cubAlloc = 0; + vector gpu_memory::dev_info_; #endif gpu_memory::PoolMode gpu_memory::mode_ = gpu_memory::NoPool; - size_t gpu_memory::poolsize_ = 0; bool gpu_memory::debug_ = false; #ifdef CPU_ONLY // CPU-only Caffe. @@ -43,8 +43,7 @@ namespace caffe { } if (debug) std::cout << "gpu_memory initialized with " - << getPoolName() << ". Poolsize = " - << (1.0*poolsize_)/(1024.0*1024.0*1024.0) << " G." << std::endl; + << getPoolName() << std::endl; } void gpu_memory::destroy() { @@ -100,39 +99,44 @@ namespace caffe { CUDA_CHECK(cudaGetDevice(&initial_device)); + for (int i = 0; i < gpus.size(); i++) { + int cur_device = gpus[i]; + if (cur_device+1 > dev_info_.size()) + dev_info_.resize(cur_device+1); + CUDA_CHECK(cudaSetDevice(gpus[i])); - size_t free_mem, total_mem; cudaDeviceProp props; - CUDA_CHECK(cudaGetDeviceProperties(&props, gpus[i])); - CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem)); + CUDA_CHECK(cudaGetDeviceProperties(&props, cur_device)); + CUDA_CHECK(cudaMemGetInfo(&dev_info_[cur_device].free, + &dev_info_[cur_device].total)); if (debug_) { std::cout << "cudaGetDeviceProperties: Mem = " - << props.totalGlobalMem <bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].free); + current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].busy); } Unlock(&spin_lock); From 1c678452b9034aa891850f749e00d6df6cb044b8 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 2 Dec 2015 15:23:03 -0800 Subject: [PATCH 4/8] d --- tools/caffe.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/caffe.cpp b/tools/caffe.cpp index f6dbb89ec2d..306a5a55d34 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -407,7 +407,7 @@ int main(int argc, char** argv) { // initialize gpu memory arena vector gpus; get_gpus(&gpus); - caffe::gpu_memory::arena arena(gpus, caffe::gpu_memory::DefaultPool, true); + caffe::gpu_memory::arena arena(gpus); if (argc == 2) { #ifdef WITH_PYTHON_LAYER From 81baeda1ff2b2e4bf28d98b0f3853fde7f603ac8 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 2 Dec 2015 21:21:23 -0800 Subject: [PATCH 5/8] adding environment switch for memory diagnostic --- src/caffe/util/gpu_memory.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp index 30e404e5f1b..b8761a7a2a2 100644 --- a/src/caffe/util/gpu_memory.cpp +++ b/src/caffe/util/gpu_memory.cpp @@ -26,9 +26,13 @@ namespace caffe { return "No GPU: CPU Only Memory"; } #else + void gpu_memory::init(const std::vector& gpus, PoolMode m, bool debug) { - debug_ = debug; + + bool debug_env = (getenv("DEBUG_GPU_MEM")!=0); + debug_ = debug || debug_env; + if (gpus.size() <= 0) { // should we report an error here ? m = gpu_memory::NoPool; From 3fc98dd71c6ef13d4ae0995663f9f8a7023f6357 Mon Sep 17 00:00:00 2001 From: Luke Yeager Date: Thu, 3 Dec 2015 13:28:36 -0800 Subject: [PATCH 6/8] Mark v0.14.0-rc.3 --- CMakeLists.txt | 2 +- Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5db96196761..f932ef909fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ cmake_minimum_required(VERSION 2.8.7) project(Caffe C CXX) # ---[ Caffe version -set(CAFFE_TARGET_VERSION "0.14.0-rc.2") +set(CAFFE_TARGET_VERSION "0.14.0-rc.3") set(CAFFE_TARGET_SOVERSION "0.14") add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION}) diff --git a/Makefile b/Makefile index 212488110d7..e2538d577a7 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ LIB_BUILD_DIR := $(BUILD_DIR)/lib STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a DYNAMIC_VERSION_MAJOR := 0 DYNAMIC_VERSION_MINOR := 14 -DYNAMIC_VERSION_REVISION := 0-rc.2 +DYNAMIC_VERSION_REVISION := 0-rc.3 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR) DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_SONAME_SHORT).$(DYNAMIC_VERSION_REVISION) From 8a2ab8e098dc87ad0dd7fb2b76bf83bc3fdacda5 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 13 Jan 2016 14:18:13 -0800 Subject: [PATCH 7/8] gpu_memory cleanup, fixes possible init issue --- 3rdparty/cub/cub/util_allocator.cuh | 10 --- Makefile | 2 +- include/caffe/util/gpu_memory.hpp | 11 +++- src/caffe/util/gpu_memory.cpp | 95 ++++++++++++++++++----------- 4 files changed, 68 insertions(+), 50 deletions(-) diff --git a/3rdparty/cub/cub/util_allocator.cuh b/3rdparty/cub/cub/util_allocator.cuh index ff19cad55ce..42b020c9b1c 100644 --- a/3rdparty/cub/cub/util_allocator.cuh +++ b/3rdparty/cub/cub/util_allocator.cuh @@ -454,16 +454,6 @@ struct CachingDeviceAllocator if (error != cudaSuccess) { if (debug) CubLog("\tdevice %d failed to allocate %lld bytes for stream %lld", device, (long long) search_key.bytes, (long long) search_key.associated_stream); - - // if (search_key.bytes < cached_bytes[device]) { - // free all cached memory (for all devices), synchrionize and retry once - cudaDeviceSynchronize(); - cudaThreadSynchronize(); - FreeAllCached(); - cudaDeviceSynchronize(); - cudaThreadSynchronize(); - error = cudaMalloc(&search_key.d_ptr, search_key.bytes); - // } } if (CubDebug(error)) return error; diff --git a/Makefile b/Makefile index e2538d577a7..65703758574 100644 --- a/Makefile +++ b/Makefile @@ -571,7 +571,7 @@ $(DYNAMIC_NAME): $(OBJS)| $(LIB_BUILD_DIR) @ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_SONAME_SHORT); ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_SONAME_SHORT) @ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT); ln -s $(DYNAMIC_SONAME_SHORT) $(DYNAMIC_NAME_SHORT) -$(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR) +$(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR) @ echo AR -o $@ $(Q)ar rcs $@ $(OBJS) diff --git a/include/caffe/util/gpu_memory.hpp b/include/caffe/util/gpu_memory.hpp index c57d98a78be..4fdf2a465a8 100644 --- a/include/caffe/util/gpu_memory.hpp +++ b/include/caffe/util/gpu_memory.hpp @@ -77,6 +77,8 @@ class gpu_memory { cudaStream_t stream_; size_t size_; }; + static void update_dev_info(int device); + # endif private: @@ -89,8 +91,12 @@ class gpu_memory { #ifndef CPU_ONLY struct MemInfo { - size_t free; - size_t total; + MemInfo() { + free = total = flush_count = 0; + } + size_t free; + size_t total; + unsigned flush_count; }; static vector dev_info_; @@ -102,7 +108,6 @@ class gpu_memory { cudaStream_t stream = cudaStreamDefault); static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault); - static void registerStream(cudaStream_t stream); static void getInfo(size_t *free_mem, size_t *used_mem); private: diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp index 5e35ef43059..70c8a31c5ed 100644 --- a/src/caffe/util/gpu_memory.cpp +++ b/src/caffe/util/gpu_memory.cpp @@ -67,7 +67,30 @@ namespace caffe { CHECK((ptr) != NULL); switch (mode_) { case CubPool: - CUDA_CHECK(cubAlloc->DeviceAllocate(ptr, size, stream)); + if (cubAlloc->DeviceAllocate(ptr, size, stream) != cudaSuccess) + { + int cur_device; + CUDA_CHECK(cudaGetDevice(&cur_device)); + // free all cached memory (for all devices), synchrionize + cudaDeviceSynchronize(); + cudaThreadSynchronize(); + cubAlloc->FreeAllCached(); + cudaDeviceSynchronize(); + cudaThreadSynchronize(); + + // Refresh per-device saved values. + for (int i = 0; i < dev_info_.size(); i++) { + // only query devices that were initialized + if (dev_info_[i].total) { + update_dev_info(i); + // record which device caused cache flush + if (i == cur_device) + dev_info_[i].flush_count++; + } + } + // retry once + CUDA_CHECK(cubAlloc->DeviceAllocate(ptr, size, stream)); + } break; default: CUDA_CHECK(cudaMalloc(ptr, size)); @@ -89,46 +112,43 @@ namespace caffe { } } - void gpu_memory::registerStream(cudaStream_t stream) { - switch (mode_) { - case CubPool: - default: - break; - } - } - void gpu_memory::initMEM(const std::vector& gpus, PoolMode m) { - mode_ = m; + void gpu_memory::update_dev_info(int device) { int initial_device; - CUDA_CHECK(cudaGetDevice(&initial_device)); + if (device+1 > dev_info_.size()) + dev_info_.resize(device+1); + + CUDA_CHECK(cudaSetDevice(device)); + cudaDeviceProp props; + CUDA_CHECK(cudaGetDeviceProperties(&props, device)); + CUDA_CHECK(cudaMemGetInfo(&dev_info_[device].free, + &dev_info_[device].total)); + + if (debug_) { + std::cout << "cudaGetDeviceProperties: Mem = " + << props.totalGlobalMem <DeviceAllocate(ptr, size, stream) != cudaSuccess) - { + if (cubAlloc->DeviceAllocate(ptr, size, stream) != cudaSuccess) { int cur_device; CUDA_CHECK(cudaGetDevice(&cur_device)); // free all cached memory (for all devices), synchrionize @@ -148,8 +146,6 @@ namespace caffe { void gpu_memory::initMEM(const std::vector& gpus, PoolMode m) { mode_ = m; - int initial_device; - switch ( mode_ ) { case CubPool: try { @@ -165,11 +161,9 @@ namespace caffe { } catch (...) {} CHECK(cubAlloc); - for (int i = 0; i < gpus.size(); i++) { update_dev_info(gpus[i]); } - break; default: break;