Skip to content
6 changes: 5 additions & 1 deletion src/CodeGen_PTX_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,7 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
if (debug::debug_level() >= 2) {
dump();
}
debug(2) << "Done with CodeGen_PTX_Dev::compile_to_src";
debug(2) << "Done with CodeGen_PTX_Dev::compile_to_src\n";

debug(1) << "PTX kernel:\n"
<< outstr.c_str() << "\n";
Expand All @@ -737,11 +737,15 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
f.close();

string cmd = "ptxas --gpu-name " + mcpu() + " " + ptx.pathname() + " -o " + sass.pathname();
debug(2) << "ptxas cmdline: (" << cmd << ")\n";
if (system(cmd.c_str()) == 0) {
cmd = "nvdisasm " + sass.pathname();
debug(2) << "nvdisasm cmdline: (" << cmd << ")\n";
int ret = system(cmd.c_str());
(void)ret; // Don't care if it fails
}
ptx.detach();
sass.detach();

// Note: It works to embed the contents of the .sass file in
// the buffer instead of the ptx source, and this could help
Expand Down
19 changes: 15 additions & 4 deletions src/Debug.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,24 @@
namespace Halide {
namespace Internal {

namespace {

// Returns a reference to the single, shared debug verbosity level.
//
// The level is stored in a function-local static so that:
//  - there is exactly one copy, read by debug::debug_level() and written by
//    debug::set_debug_level() (previously the getter had its own local
//    static that shadowed the variable the setter modified, so overrides
//    were never observed);
//  - it is initialized on first use rather than at an unspecified point
//    during static initialization of this translation unit.
//
// The initial value is parsed with atoi() from the HL_DEBUG_CODEGEN
// environment variable; when get_env_variable() yields an empty string the
// level starts at 0.
int &cached_debug_level() {
    static int level = ([]() -> int {
        std::string lvl = get_env_variable("HL_DEBUG_CODEGEN");
        return !lvl.empty() ? atoi(lvl.c_str()) : 0;
    })();
    return level;
}

}  // namespace

// Returns the current debug verbosity level.
int debug::debug_level() {
    return cached_debug_level();
}

// Overrides the debug verbosity level with `d`.
//
// Returns the level that was in effect before the call, so callers can
// restore it afterwards.
int debug::set_debug_level(int d) {
    int &level = cached_debug_level();
    const int old_level = level;
    level = d;
    return old_level;
}

} // namespace Internal
} // namespace Halide
1 change: 1 addition & 0 deletions src/Debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ class debug {
}

static int debug_level();
static int set_debug_level(int d);
};

} // namespace Internal
Expand Down
2 changes: 1 addition & 1 deletion src/Pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -882,7 +882,7 @@ struct JITFuncCallContext {
std::to_string(exit_status) +
" but halide_error was never called.\n");
}
halide_runtime_error << output;
halide_runtime_error << "ZZZ(" << output << ")ZZZ";
error_buffer.end = 0;
}
}
Expand Down
1 change: 1 addition & 0 deletions src/Target.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ Target::Feature calculate_host_cuda_capability(Target t) {
int err = interface->compute_capability(nullptr, &major, &minor);
internal_assert(err == 0) << "Failed to query cuda compute capability\n";
int ver = major * 10 + minor;
Internal::debug(0) <<"CUDA capability is " << major << "." << minor << " -> " << ver << "\n";
if (ver < 30) {
return Target::FeatureEnd;
} else if (ver < 32) {
Expand Down
26 changes: 20 additions & 6 deletions src/runtime/cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -684,6 +684,11 @@ WEAK int halide_cuda_device_release(void *user_context) {
err = cuCtxPushCurrent(ctx);
if (err != CUDA_SUCCESS) {
err = cuCtxSynchronize();
if (err != CUDA_SUCCESS) {
debug(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_release): "
<< get_error_name((CUresult)err);
// do not return!
}
}
halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);

Expand Down Expand Up @@ -1020,13 +1025,18 @@ WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) {
error(user_context) << "CUDA: In halide_cuda_device_sync, halide_cuda_get_stream returned " << result << "\n";
}
err = cuStreamSynchronize(stream);
if (err != CUDA_SUCCESS) {
error(user_context) << "CUDA: cuStreamSynchronize failed (halide_cuda_device_sync): "
<< get_error_name(err);
return err;
}
} else {
err = cuCtxSynchronize();
}
if (err != CUDA_SUCCESS) {
error(user_context) << "CUDA: cuCtxSynchronize failed: "
<< get_error_name(err);
return err;
if (err != CUDA_SUCCESS) {
error(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_sync): "
<< get_error_name(err);
return err;
}
}

#ifdef DEBUG_RUNTIME
Expand Down Expand Up @@ -1057,6 +1067,8 @@ WEAK int halide_cuda_run(void *user_context,
CUresult err;
Context ctx(user_context);
if (ctx.error != CUDA_SUCCESS) {
error(user_context) << "CUDA: Context failed: "
<< get_error_name((CUresult)ctx.error);
return ctx.error;
}

Expand Down Expand Up @@ -1136,13 +1148,15 @@ WEAK int halide_cuda_run(void *user_context,
#ifdef DEBUG_RUNTIME
err = cuCtxSynchronize();
if (err != CUDA_SUCCESS) {
error(user_context) << "CUDA: cuCtxSynchronize failed: "
error(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_run): "
<< get_error_name(err);
return err;
}
uint64_t t_after = halide_current_time_ns(user_context);
debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif

debug(user_context) << "CUDA: halide_cuda_run succeeds!\n";
return 0;
}

Expand Down
45 changes: 39 additions & 6 deletions test/correctness/gpu_dynamic_shared.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,53 @@
using namespace Halide;

int main(int argc, char **argv) {
Target t = get_jit_target_from_environment();
if (!t.has_gpu_feature()) {
printf("[SKIP] No GPU target enabled.\n");
return 0;
}
Target t = Target("host-cuda");
// get_jit_target_from_environment();
// if (!t.has_gpu_feature()) {
// printf("[SKIP] No GPU target enabled.\n");
// return 0;
// }

if (t.has_feature(Target::OpenGLCompute)) {
printf("[SKIP] Skipping test for OpenGLCompute, as it does not support dynamically-sized shared memory\n");
return 0;
}

if (t.has_feature(Target::CUDA)) {
t.set_feature(Target::Debug);
t.set_feature(Target::DisableLLVMLoopOpt);
Halide::Internal::debug::set_debug_level(2);
}

// Check dynamic allocations per-block and per-thread into both
// shared and global
#if 1
Func f("f"), g("g");
Var x("x"), xi("xi");

f(x) = x;
g(x) = f(x) + f(2 * x);

g.gpu_tile(x, xi, 16);
f.compute_at(g, xi);

f.store_in(MemoryType::GPUShared);

// The amount of shared/heap memory required varies with x
Buffer<int> out = g.realize({100}, t);
for (int x = 0; x < 100; x++) {
int correct = 3 * x;
if (out(x) != correct) {
printf("out(%d) = %d instead of %d\n",
x, out(x), correct);
return -1;
}
}
#else
for (int per_thread = 0; per_thread < 2; per_thread++) {
for (auto memory_type : {MemoryType::GPUShared, MemoryType::Heap}) {
printf("Testing: per_thread=%d, memory_type=%d\n", per_thread, (int)memory_type);

Func f("f"), g("g");
Var x("x"), xi("xi");

Expand All @@ -35,7 +67,7 @@ int main(int argc, char **argv) {
f.store_in(memory_type);

// The amount of shared/heap memory required varies with x
Buffer<int> out = g.realize({100});
Buffer<int> out = g.realize({100}, t);
for (int x = 0; x < 100; x++) {
int correct = 3 * x;
if (out(x) != correct) {
Expand All @@ -46,6 +78,7 @@ int main(int argc, char **argv) {
}
}
}
#endif

printf("Success!\n");
return 0;
Expand Down