Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 13 additions & 23 deletions 3rdparty/cub/cub/util_allocator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -408,10 +408,10 @@ struct CachingDeviceAllocator
&& (block_itr->device == device)
&& (block_itr->bin == search_key.bin)) {

// use special rule for the last ("exact size") bin: set max memory overuse to 1/8th
// use special rule for the last ("exact size") bin: set max memory overuse to 1/8th
if (search_key.bin == (unsigned int) -1 && (block_itr->bytes - search_key.bytes)*8UL > search_key.bytes)
break;

cudaStream_t prev_stream = block_itr->associated_stream;
if ((active_stream == prev_stream)
|| (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
Expand Down Expand Up @@ -454,22 +454,12 @@ struct CachingDeviceAllocator
if (error != cudaSuccess) {
if (debug) CubLog("\tdevice %d failed to allocate %lld bytes for stream %lld",
device, (long long) search_key.bytes, (long long) search_key.associated_stream);

// if (search_key.bytes < cached_bytes[device]) {
// free all cached memory (for all devices), synchrionize and retry once
cudaDeviceSynchronize();
cudaThreadSynchronize();
FreeAllCached();
cudaDeviceSynchronize();
cudaThreadSynchronize();
error = cudaMalloc(&search_key.d_ptr, search_key.bytes);
// }
}
if (CubDebug(error))
if (CubDebug(error))
return error;
if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
return error;

// Insert into live blocks
Lock(&spin_lock);
live_blocks.insert(search_key);
Expand Down Expand Up @@ -548,30 +538,30 @@ struct CachingDeviceAllocator

// Lock
Lock(&spin_lock);

// Find corresponding block descriptor
BusyBlocks::iterator block_itr = live_blocks.find(search_key);
if (block_itr != live_blocks.end()) {
// Remove from live blocks
search_key = *block_itr;
live_blocks.erase(block_itr);
live_blocks.erase(block_itr);
cached_bytes[device].busy -= search_key.bytes;

// Check if we should keep the returned allocation
if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)
{
{
// Insert returned allocation into free blocks
cached_blocks.insert(search_key);
cached_bytes[device].free += search_key.bytes;
recached = true;
if (debug) {
if (debug) {
CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
(long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);
}
}
}

Unlock(&spin_lock);

if (recached) {
Expand All @@ -585,7 +575,7 @@ struct CachingDeviceAllocator
return error;
if (CubDebug(error = cudaSetDevice(device))) return error;
}

// Actually free device memory
if (CubDebug(error = cudaFree(d_ptr))) return error;
if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -571,7 +571,7 @@ $(DYNAMIC_NAME): $(OBJS)| $(LIB_BUILD_DIR)
@ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_SONAME_SHORT); ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_SONAME_SHORT)
@ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT); ln -s $(DYNAMIC_SONAME_SHORT) $(DYNAMIC_NAME_SHORT)

$(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR)
$(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR)
@ echo AR -o $@
$(Q)ar rcs $@ $(OBJS)

Expand Down
11 changes: 8 additions & 3 deletions include/caffe/util/gpu_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ class gpu_memory {
cudaStream_t stream_;
size_t size_;
};
static void update_dev_info(int device);

# endif

private:
Expand All @@ -89,8 +91,12 @@ class gpu_memory {

#ifndef CPU_ONLY
struct MemInfo {
size_t free;
size_t total;
MemInfo() {
free = total = flush_count = 0;
}
size_t free;
size_t total;
unsigned flush_count;
};

static vector<MemInfo> dev_info_;
Expand All @@ -102,7 +108,6 @@ class gpu_memory {
cudaStream_t stream = cudaStreamDefault);
static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault);

static void registerStream(cudaStream_t stream);
static void getInfo(size_t *free_mem, size_t *used_mem);

private:
Expand Down
99 changes: 60 additions & 39 deletions src/caffe/util/gpu_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,12 @@ namespace caffe {
return "No GPU: CPU Only Memory";
}
#else

void gpu_memory::init(const std::vector<int>& gpus,
PoolMode m, bool debug) {
debug_ = debug;
bool debug_env = (getenv("DEBUG_GPU_MEM") != 0);
debug_ = debug || debug_env;

if (gpus.size() <= 0) {
// should we report an error here ?
m = gpu_memory::NoPool;
Expand Down Expand Up @@ -63,7 +66,29 @@ namespace caffe {
CHECK((ptr) != NULL);
switch (mode_) {
case CubPool:
CUDA_CHECK(cubAlloc->DeviceAllocate(ptr, size, stream));
if (cubAlloc->DeviceAllocate(ptr, size, stream) != cudaSuccess) {
int cur_device;
CUDA_CHECK(cudaGetDevice(&cur_device));
// free all cached memory (for all devices), synchrionize
cudaDeviceSynchronize();
cudaThreadSynchronize();
cubAlloc->FreeAllCached();
cudaDeviceSynchronize();
cudaThreadSynchronize();

// Refresh per-device saved values.
for (int i = 0; i < dev_info_.size(); i++) {
// only query devices that were initialized
if (dev_info_[i].total) {
update_dev_info(i);
// record which device caused cache flush
if (i == cur_device)
dev_info_[i].flush_count++;
}
}
// retry once
CUDA_CHECK(cubAlloc->DeviceAllocate(ptr, size, stream));
}
break;
default:
CUDA_CHECK(cudaMalloc(ptr, size));
Expand All @@ -85,47 +110,42 @@ namespace caffe {
}
}

void gpu_memory::registerStream(cudaStream_t stream) {
switch (mode_) {
case CubPool:
default:
break;
}
}

void gpu_memory::initMEM(const std::vector<int>& gpus, PoolMode m) {
mode_ = m;
void gpu_memory::update_dev_info(int device) {
int initial_device;

CUDA_CHECK(cudaGetDevice(&initial_device));

for (int i = 0; i < gpus.size(); i++) {
int cur_device = gpus[i];
if (cur_device+1 > dev_info_.size())
dev_info_.resize(cur_device+1);

CUDA_CHECK(cudaSetDevice(gpus[i]));
cudaDeviceProp props;
CUDA_CHECK(cudaGetDeviceProperties(&props, cur_device));
CUDA_CHECK(cudaMemGetInfo(&dev_info_[cur_device].free,
&dev_info_[cur_device].total));

if (debug_) {
std::cout << "cudaGetDeviceProperties: Mem = "
<< props.totalGlobalMem <<std::endl;
std::cout << "cudaMemGetInfo_[" << cur_device
<<": Free= " << dev_info_[cur_device].free
<< " Total= " << dev_info_[cur_device].total << std::endl;
}

// make sure we don't ask for more that total device memory
dev_info_[i].free = std::min(dev_info_[cur_device].total,
dev_info_[cur_device].free);
dev_info_[i].free = std::min(props.totalGlobalMem,
dev_info_[cur_device].free);
if (device+1 > dev_info_.size())
dev_info_.resize(device+1);

CUDA_CHECK(cudaSetDevice(device));
cudaDeviceProp props;
CUDA_CHECK(cudaGetDeviceProperties(&props, device));
CUDA_CHECK(cudaMemGetInfo(&dev_info_[device].free,
&dev_info_[device].total));

if (debug_) {
std::cout << "cudaGetDeviceProperties: Mem = "
<< props.totalGlobalMem <<std::endl;
std::cout << "cudaMemGetInfo_[" << device
<<": Free= " << dev_info_[device].free
<< " Total= " << dev_info_[device].total << std::endl;
}

// make sure we don't have more that total device memory
dev_info_[device].total = std::min(props.totalGlobalMem,
dev_info_[device].total);

// here we are adding existing 'busy' allocations to CUDA free memory
dev_info_[device].free =
std::min(dev_info_[device].total,
dev_info_[device].free
+ cubAlloc->cached_bytes[device].busy);
CUDA_CHECK(cudaSetDevice(initial_device));
}

void gpu_memory::initMEM(const std::vector<int>& gpus, PoolMode m) {
mode_ = m;
switch ( mode_ ) {
case CubPool:
try {
Expand All @@ -141,12 +161,13 @@ namespace caffe {
}
catch (...) {}
CHECK(cubAlloc);
for (int i = 0; i < gpus.size(); i++) {
update_dev_info(gpus[i]);
}
break;
default:
break;
}

CUDA_CHECK(cudaSetDevice(initial_device));
}

const char* gpu_memory::getPoolName() {
Expand All @@ -167,7 +188,7 @@ namespace caffe {
// Free memory is initial free memory minus outstanding allocations.
// Assuming we only allocate via gpu_memory since its constructon.
*free_mem = dev_info_[cur_device].free -
cubAlloc->cached_bytes[cur_device].busy;
cubAlloc->cached_bytes[cur_device].busy;
break;
default:
CUDA_CHECK(cudaMemGetInfo(free_mem, total_mem));
Expand Down