Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 17 additions & 12 deletions 3rdparty/cub/cub/util_allocator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,11 @@ struct CachingDeviceAllocator
while ( (block_itr != cached_blocks.end())
&& (block_itr->device == device)
&& (block_itr->bin == search_key.bin)) {

// use special rule for the last ("exact size") bin: set max memory overuse to 1/8th
if (search_key.bin == (unsigned int) -1 && (block_itr->bytes - search_key.bytes)*8UL > search_key.bytes)
break;

cudaStream_t prev_stream = block_itr->associated_stream;
if ((active_stream == prev_stream)
|| (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
Expand Down Expand Up @@ -460,11 +465,11 @@ struct CachingDeviceAllocator
error = cudaMalloc(&search_key.d_ptr, search_key.bytes);
// }
}
if (CubDebug(error))
if (CubDebug(error))
return error;
if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
return error;

// Insert into live blocks
Lock(&spin_lock);
live_blocks.insert(search_key);
Expand Down Expand Up @@ -543,30 +548,30 @@ struct CachingDeviceAllocator

// Lock
Lock(&spin_lock);

// Find corresponding block descriptor
BusyBlocks::iterator block_itr = live_blocks.find(search_key);
if (block_itr != live_blocks.end()) {
// Remove from live blocks
search_key = *block_itr;
live_blocks.erase(block_itr);
live_blocks.erase(block_itr);
cached_bytes[device].busy -= search_key.bytes;

// Check if we should keep the returned allocation
if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)
{
{
// Insert returned allocation into free blocks
cached_blocks.insert(search_key);
cached_bytes[device].free += search_key.bytes;
recached = true;
if (debug) {
if (debug) {
CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
(long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);
}
}
}

Unlock(&spin_lock);

if (recached) {
Expand All @@ -580,7 +585,7 @@ struct CachingDeviceAllocator
return error;
if (CubDebug(error = cudaSetDevice(device))) return error;
}

// Actually free device memory
if (CubDebug(error = cudaFree(d_ptr))) return error;
if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;
Expand Down Expand Up @@ -662,7 +667,7 @@ struct CachingDeviceAllocator
cached_blocks.erase(begin);

if (debug) CubLog("\tdevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].free);
current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].busy);
}

Unlock(&spin_lock);
Expand Down
14 changes: 10 additions & 4 deletions include/caffe/util/gpu_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,17 @@ class gpu_memory {
static void init(const std::vector<int>&, PoolMode, bool);
static void destroy();

static bool initialized_;
static PoolMode mode_;
static size_t poolsize_;
static bool debug_;
static bool initialized_;
static PoolMode mode_;
static bool debug_;

#ifndef CPU_ONLY
struct MemInfo {
size_t free;
size_t total;
};

static vector<MemInfo> dev_info_;

public:
typedef void* pointer;
Expand Down
52 changes: 27 additions & 25 deletions src/caffe/util/gpu_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ namespace caffe {

#ifndef CPU_ONLY // CPU-only Caffe.
static cub::CachingDeviceAllocator* cubAlloc = 0;
vector<gpu_memory::MemInfo> gpu_memory::dev_info_;
#endif

gpu_memory::PoolMode gpu_memory::mode_ = gpu_memory::NoPool;
size_t gpu_memory::poolsize_ = 0;
bool gpu_memory::debug_ = false;

#ifdef CPU_ONLY // CPU-only Caffe.
Expand Down Expand Up @@ -43,8 +43,7 @@ namespace caffe {
}
if (debug)
std::cout << "gpu_memory initialized with "
<< getPoolName() << ". Poolsize = "
<< (1.0*poolsize_)/(1024.0*1024.0*1024.0) << " G." << std::endl;
<< getPoolName() << std::endl;
}

void gpu_memory::destroy() {
Expand Down Expand Up @@ -100,43 +99,44 @@ namespace caffe {

CUDA_CHECK(cudaGetDevice(&initial_device));


for (int i = 0; i < gpus.size(); i++) {
int cur_device = gpus[i];
if (cur_device+1 > dev_info_.size())
dev_info_.resize(cur_device+1);

CUDA_CHECK(cudaSetDevice(gpus[i]));
size_t free_mem, total_mem;
cudaDeviceProp props;
CUDA_CHECK(cudaGetDeviceProperties(&props, gpus[i]));
CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem));
CUDA_CHECK(cudaGetDeviceProperties(&props, cur_device));
CUDA_CHECK(cudaMemGetInfo(&dev_info_[cur_device].free,
&dev_info_[cur_device].total));

if (debug_) {
std::cout << "cudaGetDeviceProperties: Mem = "
<< props.totalGlobalMem <<std:: endl;
std::cout << "cudaMemGetInfo: Free= " << free_mem
<< " Total= " << total_mem << std::endl;
<< props.totalGlobalMem <<std::endl;
std::cout << "cudaMemGetInfo_[" << cur_device
<<": Free= " << dev_info_[cur_device].free
<< " Total= " << dev_info_[cur_device].total << std::endl;
}

// make sure we don't ask for more that total device memory
free_mem = std::min(total_mem, free_mem);
free_mem = size_t(0.95*std::min(props.totalGlobalMem, free_mem));
// find out the smallest GPU size
if (poolsize_ == 0 || poolsize_ > free_mem)
poolsize_ = free_mem;
}
dev_info_[i].free = std::min(dev_info_[cur_device].total,
dev_info_[cur_device].free);
dev_info_[i].free = std::min(props.totalGlobalMem,
dev_info_[cur_device].free);
}


switch ( mode_ ) {
case CubPool:
try {
// if you are paranoid, that doesn't mean they are not after you :)
// just in case someone installed 'no cleanup' arena before
delete cubAlloc;

cubAlloc = new cub::CachingDeviceAllocator( 2, // defaults
cubAlloc = new cub::CachingDeviceAllocator( 2,
6,
32, // largest
// cached
// allocation
// becomes
// 2^32 here
poolsize_,
16,
(size_t)-1,
false,
debug_);
}
Expand Down Expand Up @@ -164,13 +164,15 @@ namespace caffe {
case CubPool:
int cur_device;
CUDA_CHECK(cudaGetDevice(&cur_device));
*total_mem = poolsize_;
*total_mem = dev_info_[cur_device].total;
// Free memory is initial free memory minus outstanding allocations.
// Assuming we only allocate via gpu_memory since its constructon.
*free_mem = poolsize_ - cubAlloc->cached_bytes[cur_device].busy;
*free_mem = dev_info_[cur_device].free -
cubAlloc->cached_bytes[cur_device].busy;
break;
default:
CUDA_CHECK(cudaMemGetInfo(free_mem, total_mem));
break;
}
}
#endif // CPU_ONLY
Expand Down