diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index 3b354e51e342..09ca05d65218 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -718,7 +718,7 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
     if (debug::debug_level() >= 2) {
         dump();
     }
-    debug(2) << "Done with CodeGen_PTX_Dev::compile_to_src";
+    debug(2) << "Done with CodeGen_PTX_Dev::compile_to_src\n";
 
     debug(1) << "PTX kernel:\n"
              << outstr.c_str() << "\n";
@@ -737,11 +737,15 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
         f.close();
 
         string cmd = "ptxas --gpu-name " + mcpu() + " " + ptx.pathname() + " -o " + sass.pathname();
+        debug(2) << "ptxas cmdline: (" << cmd << ")\n";
         if (system(cmd.c_str()) == 0) {
             cmd = "nvdisasm " + sass.pathname();
+            debug(2) << "nvdisasm cmdline: (" << cmd << ")\n";
             int ret = system(cmd.c_str());
             (void)ret;  // Don't care if it fails
         }
+ptx.detach();
+sass.detach();
 
         // Note: It works to embed the contents of the .sass file in
         // the buffer instead of the ptx source, and this could help
diff --git a/src/Debug.cpp b/src/Debug.cpp
index 22d26e83cc31..11b2c3358fb6 100644
--- a/src/Debug.cpp
+++ b/src/Debug.cpp
@@ -4,13 +4,24 @@
 namespace Halide {
 namespace Internal {
 
+namespace {
+
+int cached_debug_level = ([]() -> int {
+    std::string lvl = get_env_variable("HL_DEBUG_CODEGEN");
+    return !lvl.empty() ? atoi(lvl.c_str()) : 0;
+})();
+
+}  // namespace
+
 int debug::debug_level() {
-    static int cached_debug_level = ([]() -> int {
-        std::string lvl = get_env_variable("HL_DEBUG_CODEGEN");
-        return !lvl.empty() ? atoi(lvl.c_str()) : 0;
-    })();
     return cached_debug_level;
 }
 
+int debug::set_debug_level(int d) {
+    int old_level = cached_debug_level;
+    cached_debug_level = d;
+    return old_level;
+}
+
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/Debug.h b/src/Debug.h
index fadb5b4066ac..9f005b1f0805 100644
--- a/src/Debug.h
+++ b/src/Debug.h
@@ -63,6 +63,7 @@ class debug {
     }
 
     static int debug_level();
+    static int set_debug_level(int d);
 };
 
 }  // namespace Internal
diff --git a/src/Pipeline.cpp b/src/Pipeline.cpp
index f552cee81259..deac3bea7692 100644
--- a/src/Pipeline.cpp
+++ b/src/Pipeline.cpp
@@ -882,7 +882,7 @@ struct JITFuncCallContext {
                                    std::to_string(exit_status) + " but halide_error was never called.\n");
             }
-            halide_runtime_error << output;
+            halide_runtime_error << "ZZZ(" << output << ")ZZZ";
             error_buffer.end = 0;
         }
     }
 
diff --git a/src/Target.cpp b/src/Target.cpp
index f154a535ff1a..66c249f8526d 100644
--- a/src/Target.cpp
+++ b/src/Target.cpp
@@ -241,6 +241,7 @@ Target::Feature calculate_host_cuda_capability(Target t) {
     int err = interface->compute_capability(nullptr, &major, &minor);
     internal_assert(err == 0) << "Failed to query cuda compute capability\n";
     int ver = major * 10 + minor;
+    Internal::debug(0) <<"CUDA capability is " << major << "." << minor << " -> " << ver << "\n";
     if (ver < 30) {
         return Target::FeatureEnd;
     } else if (ver < 32) {
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 0846bd9b5c40..12f3cd404ceb 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -684,6 +684,11 @@ WEAK int halide_cuda_device_release(void *user_context) {
         err = cuCtxPushCurrent(ctx);
         if (err != CUDA_SUCCESS) {
             err = cuCtxSynchronize();
+            if (err != CUDA_SUCCESS) {
+                debug(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_release): "
+                                    << get_error_name((CUresult)err);
+                // do not return!
+            }
         }
         halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
 
@@ -1020,13 +1025,18 @@ WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) {
             error(user_context) << "CUDA: In halide_cuda_device_sync, halide_cuda_get_stream returned " << result << "\n";
         }
         err = cuStreamSynchronize(stream);
+        if (err != CUDA_SUCCESS) {
+            error(user_context) << "CUDA: cuStreamSynchronize failed (halide_cuda_device_sync): "
+                                << get_error_name(err);
+            return err;
+        }
     } else {
         err = cuCtxSynchronize();
-    }
-    if (err != CUDA_SUCCESS) {
-        error(user_context) << "CUDA: cuCtxSynchronize failed: "
-                            << get_error_name(err);
-        return err;
+        if (err != CUDA_SUCCESS) {
+            error(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_sync): "
+                                << get_error_name(err);
+            return err;
+        }
     }
 
 #ifdef DEBUG_RUNTIME
@@ -1057,6 +1067,8 @@ WEAK int halide_cuda_run(void *user_context,
     CUresult err;
     Context ctx(user_context);
     if (ctx.error != CUDA_SUCCESS) {
+        error(user_context) << "CUDA: Context failed: "
+                            << get_error_name((CUresult)ctx.error);
         return ctx.error;
     }
 
@@ -1136,13 +1148,15 @@ WEAK int halide_cuda_run(void *user_context,
 #ifdef DEBUG_RUNTIME
     err = cuCtxSynchronize();
     if (err != CUDA_SUCCESS) {
-        error(user_context) << "CUDA: cuCtxSynchronize failed: "
+        error(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_run): "
                             << get_error_name(err);
         return err;
     }
 
     uint64_t t_after = halide_current_time_ns(user_context);
     debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
 #endif
+
+    debug(user_context) << "CUDA: halide_cuda_run succeeds!\n";
     return 0;
 }
diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp
index d43386f05980..dc807c620643 100644
--- a/test/correctness/gpu_dynamic_shared.cpp
+++ b/test/correctness/gpu_dynamic_shared.cpp
@@ -4,21 +4,53 @@
 using namespace Halide;
 
 int main(int argc, char **argv) {
-    Target t = get_jit_target_from_environment();
-    if (!t.has_gpu_feature()) {
-        printf("[SKIP] No GPU target enabled.\n");
-        return 0;
-    }
+    Target t = Target("host-cuda");
+    // get_jit_target_from_environment();
+    // if (!t.has_gpu_feature()) {
+    //     printf("[SKIP] No GPU target enabled.\n");
+    //     return 0;
+    // }
 
     if (t.has_feature(Target::OpenGLCompute)) {
         printf("[SKIP] Skipping test for OpenGLCompute, as it does not support dynamically-sized shared memory\n");
         return 0;
     }
 
+    if (t.has_feature(Target::CUDA)) {
+        t.set_feature(Target::Debug);
+        t.set_feature(Target::DisableLLVMLoopOpt);
+        Halide::Internal::debug::set_debug_level(2);
+    }
+
     // Check dynamic allocations per-block and per-thread into both
     // shared and global
+#if 1
+    Func f("f"), g("g");
+    Var x("x"), xi("xi");
+
+    f(x) = x;
+    g(x) = f(x) + f(2 * x);
+
+    g.gpu_tile(x, xi, 16);
+    f.compute_at(g, xi);
+
+    f.store_in(MemoryType::GPUShared);
+
+    // The amount of shared/heap memory required varies with x
+    Buffer<int> out = g.realize({100}, t);
+    for (int x = 0; x < 100; x++) {
+        int correct = 3 * x;
+        if (out(x) != correct) {
+            printf("out(%d) = %d instead of %d\n",
+                   x, out(x), correct);
+            return -1;
+        }
+    }
+#else
     for (int per_thread = 0; per_thread < 2; per_thread++) {
         for (auto memory_type : {MemoryType::GPUShared, MemoryType::Heap}) {
+            printf("Testing: per_thread=%d, memory_type=%d\n", per_thread, (int)memory_type);
+
             Func f("f"), g("g");
             Var x("x"), xi("xi");
 
@@ -35,7 +67,7 @@ int main(int argc, char **argv) {
             f.store_in(memory_type);
 
             // The amount of shared/heap memory required varies with x
-            Buffer<int> out = g.realize({100});
+            Buffer<int> out = g.realize({100}, t);
             for (int x = 0; x < 100; x++) {
                 int correct = 3 * x;
                 if (out(x) != correct) {
@@ -46,6 +78,7 @@ int main(int argc, char **argv) {
             }
         }
     }
+#endif
 
     printf("Success!\n");
     return 0;