From 6e024b5ee63bf6f621444e33c8aa0de1eb198689 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 9 Apr 2021 13:50:06 -0700 Subject: [PATCH 01/11] Add debugging code to gpu_dynamic_shared to try to track down hard-to-repro bug on buildbots --- src/Debug.cpp | 19 +++++++++++++++---- src/Debug.h | 1 + test/correctness/gpu_dynamic_shared.cpp | 8 +++++++- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/Debug.cpp b/src/Debug.cpp index 22d26e83cc31..11b2c3358fb6 100644 --- a/src/Debug.cpp +++ b/src/Debug.cpp @@ -4,13 +4,24 @@ namespace Halide { namespace Internal { +namespace { + +int cached_debug_level = ([]() -> int { + std::string lvl = get_env_variable("HL_DEBUG_CODEGEN"); + return !lvl.empty() ? atoi(lvl.c_str()) : 0; +})(); + +} // namespace + int debug::debug_level() { - static int cached_debug_level = ([]() -> int { - std::string lvl = get_env_variable("HL_DEBUG_CODEGEN"); - return !lvl.empty() ? atoi(lvl.c_str()) : 0; - })(); return cached_debug_level; } +int debug::set_debug_level(int d) { + int old_level = cached_debug_level; + cached_debug_level = d; + return old_level; +} + } // namespace Internal } // namespace Halide diff --git a/src/Debug.h b/src/Debug.h index fadb5b4066ac..9f005b1f0805 100644 --- a/src/Debug.h +++ b/src/Debug.h @@ -63,6 +63,7 @@ class debug { } static int debug_level(); + static int set_debug_level(int d); }; } // namespace Internal diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp index d43386f05980..46b79b14041f 100644 --- a/test/correctness/gpu_dynamic_shared.cpp +++ b/test/correctness/gpu_dynamic_shared.cpp @@ -15,6 +15,12 @@ int main(int argc, char **argv) { return 0; } + if (t.has_feature(Target::CUDA)) { + t.set_feature(Target::Debug); + t.set_feature(Target::DisableLLVMLoopOpt); + Halide::Internal::debug::set_debug_level(2); + } + // Check dynamic allocations per-block and per-thread into both // shared and global for (int per_thread = 0; per_thread < 2; per_thread++) { @@ -35,7 +41,7 @@ int main(int argc, char **argv) { f.store_in(memory_type); // The amount of shared/heap memory required varies with x - Buffer out = g.realize({100}); + Buffer out = g.realize({100}, t); for (int x = 0; x < 100; x++) { int correct = 3 * x; if (out(x) != correct) { From 87ecc7b6e95771dd1c1050d139ae6da340bc5e2a Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Mon, 12 Apr 2021 14:46:08 -0700 Subject: [PATCH 02/11] Update gpu_dynamic_shared.cpp --- test/correctness/gpu_dynamic_shared.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp index 46b79b14041f..e7fa95252be5 100644 --- a/test/correctness/gpu_dynamic_shared.cpp +++ b/test/correctness/gpu_dynamic_shared.cpp @@ -18,13 +18,15 @@ int main(int argc, char **argv) { if (t.has_feature(Target::CUDA)) { t.set_feature(Target::Debug); t.set_feature(Target::DisableLLVMLoopOpt); - Halide::Internal::debug::set_debug_level(2); + Halide::Internal::debug::set_debug_level(1); } // Check dynamic allocations per-block and per-thread into both // shared and global for (int per_thread = 0; per_thread < 2; per_thread++) { for (auto memory_type : {MemoryType::GPUShared, MemoryType::Heap}) { + printf("Testing: per_thread=%d, memory_type=%d\n", per_thread, (int) memory_type); + Func f("f"), g("g"); Var x("x"), xi("xi"); From ee26ddde4b0faa057f6734954db0ddabb33d73cb Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Mon, 12 Apr 2021 15:12:59 -0700 Subject: [PATCH 03/11] Update gpu_dynamic_shared.cpp --- test/correctness/gpu_dynamic_shared.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp index e7fa95252be5..80f6bf7ffed4 100644 --- a/test/correctness/gpu_dynamic_shared.cpp +++ b/test/correctness/gpu_dynamic_shared.cpp @@ -25,7 +25,7 @@ int main(int argc, char **argv) { // shared and global for (int per_thread = 0; per_thread < 2; per_thread++) { for (auto memory_type : {MemoryType::GPUShared, MemoryType::Heap}) { - printf("Testing: per_thread=%d, memory_type=%d\n", per_thread, (int) memory_type); + printf("Testing: per_thread=%d, memory_type=%d\n", per_thread, (int)memory_type); Func f("f"), g("g"); Var x("x"), xi("xi"); From 99de4f2e54165c4447ca92b8af3b2a4496af0c6b Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Mon, 12 Apr 2021 18:32:33 -0700 Subject: [PATCH 04/11] Update gpu_dynamic_shared.cpp --- test/correctness/gpu_dynamic_shared.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp index 80f6bf7ffed4..6febc4808bc0 100644 --- a/test/correctness/gpu_dynamic_shared.cpp +++ b/test/correctness/gpu_dynamic_shared.cpp @@ -23,6 +23,29 @@ int main(int argc, char **argv) { // Check dynamic allocations per-block and per-thread into both // shared and global +#if 1 + Func f("f"), g("g"); + Var x("x"), xi("xi"); + + f(x) = x; + g(x) = f(x) + f(2 * x); + + g.gpu_tile(x, xi, 16); + f.compute_at(g, xi); + + f.store_in(MemoryType::GPUShared); + + // The amount of shared/heap memory required varies with x + Buffer out = g.realize({100}, t); + for (int x = 0; x < 100; x++) { + int correct = 3 * x; + if (out(x) != correct) { + printf("out(%d) = %d instead of %d\n", + x, out(x), correct); + return -1; + } + } +#else for (int per_thread = 0; per_thread < 2; per_thread++) { for (auto memory_type : {MemoryType::GPUShared, MemoryType::Heap}) { printf("Testing: per_thread=%d, memory_type=%d\n", per_thread, (int)memory_type); @@ -54,6 +77,7 @@ int main(int argc, char **argv) { } } } +#endif printf("Success!\n"); return 0; From 4e92e7b5a709916ae99b8c97fb43d988ae7962a1 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Tue, 13 Apr 2021 10:06:05 -0700 Subject: [PATCH 05/11] wip --- src/Pipeline.cpp | 2 +- src/runtime/cuda.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Pipeline.cpp b/src/Pipeline.cpp index f552cee81259..deac3bea7692 100644 --- a/src/Pipeline.cpp +++ b/src/Pipeline.cpp @@ -882,7 +882,7 @@ struct JITFuncCallContext { std::to_string(exit_status) + " but halide_error was never called.\n"); } - halide_runtime_error << output; + halide_runtime_error << "ZZZ(" << output << ")ZZZ"; error_buffer.end = 0; } } diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index 0846bd9b5c40..c66afd3b9af0 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -1024,7 +1024,7 @@ WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) { err = cuCtxSynchronize(); } if (err != CUDA_SUCCESS) { - error(user_context) << "CUDA: cuCtxSynchronize failed: " + error(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_sync): " << get_error_name(err); return err; } @@ -1136,7 +1136,7 @@ WEAK int halide_cuda_run(void *user_context, #ifdef DEBUG_RUNTIME err = cuCtxSynchronize(); if (err != CUDA_SUCCESS) { - error(user_context) << "CUDA: cuCtxSynchronize failed: " + error(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_run): " << get_error_name(err); return err; } From 0a380fd47770af8f68978bb4ad034158ba08b59f Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Tue, 13 Apr 2021 10:12:54 -0700 Subject: [PATCH 06/11] Update cuda.cpp --- src/runtime/cuda.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index c66afd3b9af0..7eb0b5761fb6 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -1057,6 +1057,8 @@ WEAK int halide_cuda_run(void *user_context, CUresult err; Context ctx(user_context); if (ctx.error != CUDA_SUCCESS) { + error(user_context) << "CUDA: Context failed: " + << get_error_name(ctx.error); return ctx.error; } @@ -1143,6 +1145,8 @@ WEAK int halide_cuda_run(void *user_context, uint64_t t_after = halide_current_time_ns(user_context); debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif + + debug(user_context) << "CUDA: halide_cuda_run succeeds!\n"; return 0; } From f99087e60ef24e01193dcb11bb8025de7dbe15fe Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Tue, 13 Apr 2021 10:14:57 -0700 Subject: [PATCH 07/11] Update cuda.cpp --- src/runtime/cuda.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index 7eb0b5761fb6..9cbdb67421ff 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -684,6 +684,11 @@ WEAK int halide_cuda_device_release(void *user_context) { err = cuCtxPushCurrent(ctx); if (err != CUDA_SUCCESS) { err = cuCtxSynchronize(); + if (err != CUDA_SUCCESS) { + debug(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_release): " + << get_error_name(err); + // do not return! + } } halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED); @@ -1020,13 +1025,18 @@ WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) { error(user_context) << "CUDA: In halide_cuda_device_sync, halide_cuda_get_stream returned " << result << "\n"; } err = cuStreamSynchronize(stream); + if (err != CUDA_SUCCESS) { + error(user_context) << "CUDA: cuStreamSynchronize failed (halide_cuda_device_sync): " + << get_error_name(err); + return err; + } } else { err = cuCtxSynchronize(); - } - if (err != CUDA_SUCCESS) { - error(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_sync): " - << get_error_name(err); - return err; + if (err != CUDA_SUCCESS) { + error(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_sync): " + << get_error_name(err); + return err; + } } #ifdef DEBUG_RUNTIME From 0245bacd035b082ef5dd070d08df2b956d50f28c Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Tue, 13 Apr 2021 10:18:11 -0700 Subject: [PATCH 08/11] Update cuda.cpp --- src/runtime/cuda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index 9cbdb67421ff..12f3cd404ceb 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -686,7 +686,7 @@ WEAK int halide_cuda_device_release(void *user_context) { err = cuCtxSynchronize(); if (err != CUDA_SUCCESS) { debug(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_release): " - << get_error_name(err); + << get_error_name((CUresult)err); // do not return! } } @@ -1068,7 +1068,7 @@ WEAK int halide_cuda_run(void *user_context, Context ctx(user_context); if (ctx.error != CUDA_SUCCESS) { error(user_context) << "CUDA: Context failed: " - << get_error_name(ctx.error); + << get_error_name((CUresult)ctx.error); return ctx.error; } From 08c36eeb0d1ec70fe35225edd129a88e1d2cc024 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Tue, 13 Apr 2021 10:29:45 -0700 Subject: [PATCH 09/11] Update gpu_dynamic_shared.cpp --- test/correctness/gpu_dynamic_shared.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp index 6febc4808bc0..dc807c620643 100644 --- a/test/correctness/gpu_dynamic_shared.cpp +++ b/test/correctness/gpu_dynamic_shared.cpp @@ -4,11 +4,12 @@ using namespace Halide; int main(int argc, char **argv) { - Target t = get_jit_target_from_environment(); - if (!t.has_gpu_feature()) { - printf("[SKIP] No GPU target enabled.\n"); - return 0; - } + Target t = Target("host-cuda"); + // get_jit_target_from_environment(); + // if (!t.has_gpu_feature()) { + // printf("[SKIP] No GPU target enabled.\n"); + // return 0; + // } if (t.has_feature(Target::OpenGLCompute)) { printf("[SKIP] Skipping test for OpenGLCompute, as it does not support dynamically-sized shared memory\n"); @@ -18,7 +19,7 @@ int main(int argc, char **argv) { if (t.has_feature(Target::CUDA)) { t.set_feature(Target::Debug); t.set_feature(Target::DisableLLVMLoopOpt); - Halide::Internal::debug::set_debug_level(1); + Halide::Internal::debug::set_debug_level(2); } // Check dynamic allocations per-block and per-thread into both From 4bb78f894e0b82cb6f473926f9cd40bfac013e32 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Tue, 13 Apr 2021 10:50:45 -0700 Subject: [PATCH 10/11] wip --- src/CodeGen_PTX_Dev.cpp | 3 ++- src/Target.cpp | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 3b354e51e342..c6c6cfcffa34 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -718,7 +718,7 @@ vector CodeGen_PTX_Dev::compile_to_src() { if (debug::debug_level() >= 2) { dump(); } - debug(2) << "Done with CodeGen_PTX_Dev::compile_to_src"; + debug(2) << "Done with CodeGen_PTX_Dev::compile_to_src\n"; debug(1) << "PTX kernel:\n" << outstr.c_str() << "\n"; @@ -739,6 +739,7 @@ vector CodeGen_PTX_Dev::compile_to_src() { string cmd = "ptxas --gpu-name " + mcpu() + " " + ptx.pathname() + " -o " + sass.pathname(); if (system(cmd.c_str()) == 0) { cmd = "nvdisasm " + sass.pathname(); + debug(2) << "ptxas cmdline: (" << cmd << ")\n"; int ret = system(cmd.c_str()); (void)ret; // Don't care if it fails } diff --git a/src/Target.cpp b/src/Target.cpp index f154a535ff1a..66c249f8526d 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -241,6 +241,7 @@ Target::Feature calculate_host_cuda_capability(Target t) { int err = interface->compute_capability(nullptr, &major, &minor); internal_assert(err == 0) << "Failed to query cuda compute capability\n"; int ver = major * 10 + minor; + Internal::debug(0) <<"CUDA capability is " << major << "." << minor << " -> " << ver << "\n"; if (ver < 30) { return Target::FeatureEnd; } else if (ver < 32) { From 0b84e1e334dca1bd6f9a369dd34f2cc1bab75010 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Tue, 13 Apr 2021 10:53:34 -0700 Subject: [PATCH 11/11] Update CodeGen_PTX_Dev.cpp --- src/CodeGen_PTX_Dev.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index c6c6cfcffa34..09ca05d65218 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -737,12 +737,15 @@ vector CodeGen_PTX_Dev::compile_to_src() { f.close(); string cmd = "ptxas --gpu-name " + mcpu() + " " + ptx.pathname() + " -o " + sass.pathname(); + debug(2) << "ptxas cmdline: (" << cmd << ")\n"; if (system(cmd.c_str()) == 0) { cmd = "nvdisasm " + sass.pathname(); - debug(2) << "ptxas cmdline: (" << cmd << ")\n"; + debug(2) << "nvdisasm cmdline: (" << cmd << ")\n"; int ret = system(cmd.c_str()); (void)ret; // Don't care if it fails } +ptx.detach(); +sass.detach(); // Note: It works to embed the contents of the .sass file in // the buffer instead of the ptx source, and this could help