Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -1527,6 +1527,12 @@ $(FILTERS_DIR)/nested_externs_%.a: $(BIN_DIR)/nested_externs.generator
@mkdir -p $(@D)
$(CURDIR)/$< -g nested_externs_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime

# Similarly, gpu_multi needs two different kernels to test compilation caching.
# Also requires the user_context target feature.
# Pattern rule: build one AOT kernel library per generator variant ($* is
# the variant name, e.g. "add" or "mul"). The target string includes
# user_context (required by this test) and no_runtime (the runtime is
# linked once, separately, to avoid duplicate-symbol clashes).
$(FILTERS_DIR)/gpu_multi_context_threaded_%.a: $(BIN_DIR)/gpu_multi_context_threaded.generator
@mkdir -p $(@D)
$(CURDIR)/$< -g gpu_multi_context_threaded_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-user_context

GEN_AOT_CXX_FLAGS=$(TEST_CXX_FLAGS) -Wno-unknown-pragmas
GEN_AOT_INCLUDES=-I$(INCLUDE_DIR) -I$(FILTERS_DIR) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common -I $(ROOT_DIR)/apps/support -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools
GEN_AOT_LD_FLAGS=$(COMMON_LD_FLAGS)
Expand Down Expand Up @@ -1622,11 +1628,31 @@ generator_aot_multitarget: $(BIN_DIR)/$(TARGET)/generator_aot_multitarget
HL_MULTITARGET_TEST_USE_NOBOUNDSQUERY_FEATURE=1 $(CURDIR)/$<
@-echo

# gpu_multi_context_threaded has additional deps to link in
# Links the aottest source against BOTH generated kernel libraries (add and
# mul) plus the standalone runtime, with OpenCL/CUDA link flags since the
# kernels target a GPU API.
$(BIN_DIR)/$(TARGET)/generator_aot_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \
$(FILTERS_DIR)/gpu_multi_context_threaded_add.a \
$(FILTERS_DIR)/gpu_multi_context_threaded_mul.a \
$(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
@mkdir -p $(@D)
$(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@

# Same test via the C++ backend: compiles the .halide_generated.cpp
# sources for both kernels instead of linking the precompiled .a files.
$(BIN_DIR)/$(TARGET)/generator_aotcpp_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \
$(FILTERS_DIR)/gpu_multi_context_threaded_add.halide_generated.cpp \
$(FILTERS_DIR)/gpu_multi_context_threaded_mul.halide_generated.cpp \
$(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
@mkdir -p $(@D)
$(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@

# The nested_externs test doesn't actually contain a generator named
# "nested_externs", and has no internal tests in any case, so the
# default per-generator test target is deliberately a no-op.
test_generator_nested_externs:
@echo "Skipping"

# gpu_multi doesn't actually contain a generator named
# "gpu_multi", and has no internal tests in any case.
test_generator_gpu_multi:
@echo "Skipping"

$(BUILD_DIR)/RunGenMain.o: $(ROOT_DIR)/tools/RunGenMain.cpp $(RUNTIME_EXPORTED_INCLUDES) $(ROOT_DIR)/tools/RunGen.h
@mkdir -p $(@D)
$(CXX) -c $< $(filter-out -g, $(TEST_CXX_FLAGS)) $(OPTIMIZE) -Os $(IMAGE_IO_CXX_FLAGS) -I$(INCLUDE_DIR) -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools -o $@
Expand Down
163 changes: 45 additions & 118 deletions src/runtime/cuda.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "HalideRuntimeCuda.h"
#include "device_buffer_utils.h"
#include "device_interface.h"
#include "gpu_context_common.h"
#include "mini_cuda.h"
#include "printer.h"
#include "scoped_mutex_lock.h"
Expand Down Expand Up @@ -239,43 +240,7 @@ class Context {
}
};

// Halide allocates a device API controlled pointer slot as part of
// each compiled module. The slot is used to store information to
// avoid having to reload/recompile kernel code on each call into a
// Halide filter. The cuda runtime uses this pointer to maintain a
// linked list of contexts into which the module has been loaded.
//
// A global list of all registered filters is also kept so all modules
// loaded on a given context can be unloaded and removed from the list
// when halide_device_release is called on a specific context.
//
// The registered_filters struct is not freed as it is pointed to by the
// Halide generated code. The module_state structs are freed.

struct module_state {
CUcontext context;
CUmodule module;
module_state *next;
};

struct registered_filters {
module_state *modules;
registered_filters *next;
};
WEAK registered_filters *filters_list = nullptr;
// This spinlock protects the above filters_list.
WEAK halide_mutex filters_list_lock;

WEAK module_state *find_module_for_context(const registered_filters *filters, CUcontext ctx) {
module_state *modules = filters->modules;
while (modules != nullptr) {
if (modules->context == ctx) {
return modules;
}
modules = modules->next;
}
return nullptr;
}
WEAK Halide::Internal::GPUCompilationCache<CUcontext, CUmodule> compilation_cache;

WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
// Initialize CUDA
Expand Down Expand Up @@ -505,6 +470,33 @@ WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size
#endif
}

// JIT-loads the given PTX source into a CUmodule for the current CUDA
// context via cuModuleLoadDataEx. Returns nullptr on failure (after
// reporting through error()); on success the caller owns the module and
// must eventually release it with cuModuleUnload.
WEAK CUmodule compile_kernel(void *user_context, const char *ptx_src, int size) {
    debug(user_context) << "CUDA: compile_kernel cuModuleLoadDataEx " << (void *)ptx_src << ", " << size << " -> ";

    CUjit_option options[] = {CU_JIT_MAX_REGISTERS};
    unsigned int max_regs_per_thread = 64;

    // A hack to enable control over max register count for
    // testing. This should be surfaced in the schedule somehow
    // instead.
    char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS");
    if (regs) {
        // Ignore unparseable or nonpositive values rather than passing a
        // bogus JIT option to the driver; keep the default of 64.
        int parsed = atoi(regs);
        if (parsed > 0) {
            max_regs_per_thread = (unsigned int)parsed;
        }
    }
    void *optionValues[] = {(void *)(uintptr_t)max_regs_per_thread};
    CUmodule loaded_module;
    CUresult err = cuModuleLoadDataEx(&loaded_module, ptx_src, 1, options, optionValues);

    if (err != CUDA_SUCCESS) {
        // Name the call that actually failed (the Ex variant).
        error(user_context) << "CUDA: cuModuleLoadDataEx failed: "
                            << get_error_name(err);
        return nullptr;
    }
    debug(user_context) << (void *)(loaded_module) << "\n";
    return loaded_module;
}

} // namespace Cuda
} // namespace Internal
} // namespace Runtime
Expand All @@ -526,54 +518,12 @@ WEAK int halide_cuda_initialize_kernels(void *user_context, void **state_ptr, co
uint64_t t_before = halide_current_time_ns(user_context);
#endif

halide_assert(user_context, &filters_list_lock != nullptr);
{
ScopedMutexLock spinlock(&filters_list_lock);

// Create the state object if necessary. This only happens once, regardless
// of how many times halide_initialize_kernels/halide_release is called.
// halide_release traverses this list and releases the module objects, but
// it does not modify the list nodes created/inserted here.
registered_filters **filters = (registered_filters **)state_ptr;
if (!(*filters)) {
*filters = (registered_filters *)malloc(sizeof(registered_filters));
(*filters)->modules = nullptr;
(*filters)->next = filters_list;
filters_list = *filters;
}

// Create the module itself if necessary.
module_state *loaded_module = find_module_for_context(*filters, ctx.context);
if (loaded_module == nullptr) {
loaded_module = (module_state *)malloc(sizeof(module_state));
debug(user_context) << " cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> ";

CUjit_option options[] = {CU_JIT_MAX_REGISTERS};
unsigned int max_regs_per_thread = 64;

// A hack to enable control over max register count for
// testing. This should be surfaced in the schedule somehow
// instead.
char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS");
if (regs) {
max_regs_per_thread = atoi(regs);
}
void *optionValues[] = {(void *)(uintptr_t)max_regs_per_thread};
CUresult err = cuModuleLoadDataEx(&loaded_module->module, ptx_src, 1, options, optionValues);

if (err != CUDA_SUCCESS) {
free(loaded_module);
error(user_context) << "CUDA: cuModuleLoadData failed: "
<< get_error_name(err);
return err;
} else {
debug(user_context) << (void *)(loaded_module->module) << "\n";
}
loaded_module->context = ctx.context;
loaded_module->next = (*filters)->modules;
(*filters)->modules = loaded_module;
}
} // spinlock
CUmodule loaded_module;
if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.context, loaded_module,
compile_kernel, user_context, ptx_src, size)) {
return halide_error_code_generic_error;
}
halide_assert(user_context, loaded_module != nullptr);

#ifdef DEBUG_RUNTIME
uint64_t t_after = halide_current_time_ns(user_context);
Expand Down Expand Up @@ -704,7 +654,7 @@ WEAK int halide_cuda_device_release(void *user_context) {
<< "CUDA: halide_cuda_device_release (user_context: " << user_context << ")\n";

// If we haven't even loaded libcuda, don't load it just to quit.
if (!lib_cuda) {
if (!cuInit) {
return 0;
}

Expand All @@ -728,34 +678,7 @@ WEAK int halide_cuda_device_release(void *user_context) {
// Dump the contents of the free list, ignoring errors.
halide_cuda_release_unused_device_allocations(user_context);

{
ScopedMutexLock spinlock(&filters_list_lock);

// Unload the modules attached to this context. Note that the list
// nodes themselves are not freed, only the module objects are
// released. Subsequent calls to halide_init_kernels might re-create
// the program object using the same list node to store the module
// object.
registered_filters *filters = filters_list;
while (filters) {
module_state **prev_ptr = &filters->modules;
module_state *loaded_module = filters->modules;
while (loaded_module != nullptr) {
if (loaded_module->context == ctx) {
debug(user_context) << " cuModuleUnload " << loaded_module->module << "\n";
err = cuModuleUnload(loaded_module->module);
halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
*prev_ptr = loaded_module->next;
free(loaded_module);
loaded_module = *prev_ptr;
} else {
loaded_module = loaded_module->next;
prev_ptr = &loaded_module->next;
}
}
filters = filters->next;
}
} // spinlock
compilation_cache.delete_context(user_context, ctx, cuModuleUnload);

CUcontext old_ctx;
cuCtxPopCurrent(&old_ctx);
Expand Down Expand Up @@ -919,12 +842,15 @@ WEAK int cuda_do_multidimensional_copy(void *user_context, const device_copy &c,
<< (void *)src << " -> " << (void *)dst << ", " << c.chunk_size << " bytes\n";
if (!from_host && to_host) {
debug(user_context) << "cuMemcpyDtoH(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
copy_name = "cuMemcpyDtoH";
err = cuMemcpyDtoH((void *)dst, (CUdeviceptr)src, c.chunk_size);
} else if (from_host && !to_host) {
debug(user_context) << "cuMemcpyHtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
copy_name = "cuMemcpyHtoD";
err = cuMemcpyHtoD((CUdeviceptr)dst, (void *)src, c.chunk_size);
} else if (!from_host && !to_host) {
debug(user_context) << "cuMemcpyDtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
copy_name = "cuMemcpyDtoD";
err = cuMemcpyDtoD((CUdeviceptr)dst, (CUdeviceptr)src, c.chunk_size);
} else if (dst != src) {
debug(user_context) << "memcpy(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
Expand Down Expand Up @@ -1133,9 +1059,9 @@ WEAK int halide_cuda_run(void *user_context,
#endif

halide_assert(user_context, state_ptr);
module_state *loaded_module = find_module_for_context((registered_filters *)state_ptr, ctx.context);
halide_assert(user_context, loaded_module != nullptr);
CUmodule mod = loaded_module->module;
CUmodule mod = nullptr;
bool found_module = compilation_cache.lookup(ctx.context, state_ptr, mod);
halide_assert(user_context, found_module && mod != nullptr);
debug(user_context) << "Got module " << mod << "\n";
halide_assert(user_context, mod);
CUfunction f;
Expand Down Expand Up @@ -1264,7 +1190,7 @@ WEAK const halide_device_interface_t *halide_cuda_device_interface() {
}

WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *minor) {
if (!lib_cuda) {
if (!lib_cuda && !cuInit) {
// If cuda can't be found, we want to return 0, 0 and it's not
// considered an error. So we should be very careful about
// looking for libcuda without tripping any errors in the rest
Expand Down Expand Up @@ -1313,6 +1239,7 @@ WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *min

namespace {
WEAK __attribute__((destructor)) void halide_cuda_cleanup() {
compilation_cache.release_all(nullptr, cuModuleUnload);
halide_cuda_device_release(nullptr);
}
} // namespace
Expand Down
Loading