From 6e024b5ee63bf6f621444e33c8aa0de1eb198689 Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Fri, 9 Apr 2021 13:50:06 -0700
Subject: [PATCH 01/11] Add debugging code to gpu_dynamic_shared to try to
 track down hard-to-repro bug on buildbots

---
 src/Debug.cpp                           | 19 +++++++++++++++----
 src/Debug.h                             |  1 +
 test/correctness/gpu_dynamic_shared.cpp |  8 +++++++-
 3 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/src/Debug.cpp b/src/Debug.cpp
index 22d26e83cc31..11b2c3358fb6 100644
--- a/src/Debug.cpp
+++ b/src/Debug.cpp
@@ -4,13 +4,34 @@
 namespace Halide {
 namespace Internal {
 
+namespace {
+
+// Returns the process-wide debug level by reference. The value lives in a
+// function-local static so it is initialized from HL_DEBUG_CODEGEN on first
+// use, rather than whenever this translation unit's namespace-scope
+// initializers happen to run relative to other translation units.
+int &cached_debug_level() {
+    static int level = ([]() -> int {
+        std::string lvl = get_env_variable("HL_DEBUG_CODEGEN");
+        return !lvl.empty() ? atoi(lvl.c_str()) : 0;
+    })();
+    return level;
+}
+
+}  // namespace
+
 int debug::debug_level() {
-    static int cached_debug_level = ([]() -> int {
-        std::string lvl = get_env_variable("HL_DEBUG_CODEGEN");
-        return !lvl.empty() ? atoi(lvl.c_str()) : 0;
-    })();
-    return cached_debug_level;
+    return cached_debug_level();
 }
 
+// Overrides the debug level and returns the previous value so that callers
+// can restore it afterwards.
+int debug::set_debug_level(int d) {
+    int &level = cached_debug_level();
+    const int old_level = level;
+    level = d;
+    return old_level;
+}
+
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/Debug.h b/src/Debug.h
index fadb5b4066ac..9f005b1f0805 100644
--- a/src/Debug.h
+++ b/src/Debug.h
@@ -63,6 +63,7 @@ class debug {
     }
 
     static int debug_level();
+    static int set_debug_level(int d);
 };
 
 }  // namespace Internal
diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp
index d43386f05980..46b79b14041f 100644
--- a/test/correctness/gpu_dynamic_shared.cpp
+++ b/test/correctness/gpu_dynamic_shared.cpp
@@ -15,6 +15,12 @@ int main(int argc, char **argv) {
         return 0;
     }
 
+    if (t.has_feature(Target::CUDA)) {
+        t.set_feature(Target::Debug);
+        t.set_feature(Target::DisableLLVMLoopOpt);
+        Halide::Internal::debug::set_debug_level(2);
+    }
+
     // Check dynamic allocations per-block and per-thread into both
     // shared and global
     for (int per_thread = 0; per_thread < 2; per_thread++) {
@@ -35,7 +41,7 @@ int main(int argc, char **argv) {
             f.store_in(memory_type);
 
             // The amount of shared/heap memory required varies with x
-            Buffer<int> out = g.realize({100});
+            Buffer<int> out = g.realize({100}, t);
             for (int x = 0; x < 100; x++) {
                 int correct = 3 * x;
                 if (out(x) != correct) {

From 87ecc7b6e95771dd1c1050d139ae6da340bc5e2a Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Mon, 12 Apr 2021 14:46:08 -0700
Subject: [PATCH 02/11] gpu_dynamic_shared.cpp: lower debug level to 1 and print each test case

---
 test/correctness/gpu_dynamic_shared.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp
index 46b79b14041f..e7fa95252be5 100644
--- a/test/correctness/gpu_dynamic_shared.cpp
+++ b/test/correctness/gpu_dynamic_shared.cpp
@@ -18,13 +18,15 @@ int main(int argc, char **argv) {
     if (t.has_feature(Target::CUDA)) {
         t.set_feature(Target::Debug);
         t.set_feature(Target::DisableLLVMLoopOpt);
-        Halide::Internal::debug::set_debug_level(2);
+        Halide::Internal::debug::set_debug_level(1);
     }
 
     // Check dynamic allocations per-block and per-thread into both
     // shared and global
     for (int per_thread = 0; per_thread < 2; per_thread++) {
         for (auto memory_type : {MemoryType::GPUShared, MemoryType::Heap}) {
+            printf("Testing: per_thread=%d, memory_type=%d\n", per_thread, (int) memory_type);
+
             Func f("f"), g("g");
             Var x("x"), xi("xi");
 

From ee26ddde4b0faa057f6734954db0ddabb33d73cb Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Mon, 12 Apr 2021 15:12:59 -0700
Subject: [PATCH 03/11] gpu_dynamic_shared.cpp: remove space after cast in printf call

---
 test/correctness/gpu_dynamic_shared.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp
index e7fa95252be5..80f6bf7ffed4 100644
--- a/test/correctness/gpu_dynamic_shared.cpp
+++ b/test/correctness/gpu_dynamic_shared.cpp
@@ -25,7 +25,7 @@ int main(int argc, char **argv) {
     // shared and global
     for (int per_thread = 0; per_thread < 2; per_thread++) {
         for (auto memory_type : {MemoryType::GPUShared, MemoryType::Heap}) {
-            printf("Testing: per_thread=%d, memory_type=%d\n", per_thread, (int) memory_type);
+            printf("Testing: per_thread=%d, memory_type=%d\n", per_thread, (int)memory_type);
 
             Func f("f"), g("g");
             Var x("x"), xi("xi");

From 99de4f2e54165c4447ca92b8af3b2a4496af0c6b Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Mon, 12 Apr 2021 18:32:33 -0700
Subject: [PATCH 04/11] gpu_dynamic_shared.cpp: temporarily run only a single GPUShared case

---
 test/correctness/gpu_dynamic_shared.cpp | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp
index 80f6bf7ffed4..6febc4808bc0 100644
--- a/test/correctness/gpu_dynamic_shared.cpp
+++ b/test/correctness/gpu_dynamic_shared.cpp
@@ -23,6 +23,29 @@ int main(int argc, char **argv) {
 
     // Check dynamic allocations per-block and per-thread into both
     // shared and global
+#if 1
+    Func f("f"), g("g");
+    Var x("x"), xi("xi");
+
+    f(x) = x;
+    g(x) = f(x) + f(2 * x);
+
+    g.gpu_tile(x, xi, 16);
+    f.compute_at(g, xi);
+
+    f.store_in(MemoryType::GPUShared);
+
+    // The amount of shared/heap memory required varies with x
+    Buffer<int> out = g.realize({100}, t);
+    for (int x = 0; x < 100; x++) {
+        int correct = 3 * x;
+        if (out(x) != correct) {
+            printf("out(%d) = %d instead of %d\n",
+                   x, out(x), correct);
+            return -1;
+        }
+    }
+#else
     for (int per_thread = 0; per_thread < 2; per_thread++) {
         for (auto memory_type : {MemoryType::GPUShared, MemoryType::Heap}) {
             printf("Testing: per_thread=%d, memory_type=%d\n", per_thread, (int)memory_type);
@@ -54,6 +77,7 @@ int main(int argc, char **argv) {
             }
         }
     }
+#endif
 
     printf("Success!\n");
     return 0;

From 4e92e7b5a709916ae99b8c97fb43d988ae7962a1 Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Tue, 13 Apr 2021 10:06:05 -0700
Subject: [PATCH 05/11] wip: mark JIT error output; name caller in cuCtxSynchronize errors

---
 src/Pipeline.cpp     | 2 +-
 src/runtime/cuda.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Pipeline.cpp b/src/Pipeline.cpp
index f552cee81259..deac3bea7692 100644
--- a/src/Pipeline.cpp
+++ b/src/Pipeline.cpp
@@ -882,7 +882,7 @@ struct JITFuncCallContext {
                           std::to_string(exit_status) +
                           " but halide_error was never called.\n");
             }
-            halide_runtime_error << output;
+            halide_runtime_error << "ZZZ(" << output << ")ZZZ";
             error_buffer.end = 0;
         }
     }
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 0846bd9b5c40..c66afd3b9af0 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -1024,7 +1024,7 @@ WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) {
         err = cuCtxSynchronize();
     }
     if (err != CUDA_SUCCESS) {
-        error(user_context) << "CUDA: cuCtxSynchronize failed: "
+        error(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_sync): "
                             << get_error_name(err);
         return err;
     }
@@ -1136,7 +1136,7 @@ WEAK int halide_cuda_run(void *user_context,
 #ifdef DEBUG_RUNTIME
     err = cuCtxSynchronize();
     if (err != CUDA_SUCCESS) {
-        error(user_context) << "CUDA: cuCtxSynchronize failed: "
+        error(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_run): "
                             << get_error_name(err);
         return err;
     }

From 0a380fd47770af8f68978bb4ad034158ba08b59f Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Tue, 13 Apr 2021 10:12:54 -0700
Subject: [PATCH 06/11] cuda.cpp: report Context failure and log success in halide_cuda_run

---
 src/runtime/cuda.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index c66afd3b9af0..7eb0b5761fb6 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -1057,6 +1057,8 @@ WEAK int halide_cuda_run(void *user_context,
     CUresult err;
     Context ctx(user_context);
     if (ctx.error != CUDA_SUCCESS) {
+        error(user_context) << "CUDA: Context failed: "
+                            << get_error_name(ctx.error);
         return ctx.error;
     }
 
@@ -1143,6 +1145,8 @@ WEAK int halide_cuda_run(void *user_context,
     uint64_t t_after = halide_current_time_ns(user_context);
     debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
 #endif
+
+    debug(user_context) << "CUDA: halide_cuda_run succeeds!\n";
     return 0;
 }
 

From f99087e60ef24e01193dcb11bb8025de7dbe15fe Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Tue, 13 Apr 2021 10:14:57 -0700
Subject: [PATCH 07/11] cuda.cpp: report stream and context sync failures separately

---
 src/runtime/cuda.cpp | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 7eb0b5761fb6..9cbdb67421ff 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -684,6 +684,11 @@ WEAK int halide_cuda_device_release(void *user_context) {
         err = cuCtxPushCurrent(ctx);
         if (err != CUDA_SUCCESS) {
             err = cuCtxSynchronize();
+            if (err != CUDA_SUCCESS) {
+                debug(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_release): "
+                                    << get_error_name(err);
+                // do not return!
+            }
         }
         halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
 
@@ -1020,13 +1025,18 @@ WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) {
             error(user_context) << "CUDA: In halide_cuda_device_sync, halide_cuda_get_stream returned " << result << "\n";
         }
         err = cuStreamSynchronize(stream);
+        if (err != CUDA_SUCCESS) {
+            error(user_context) << "CUDA: cuStreamSynchronize failed (halide_cuda_device_sync): "
+                                << get_error_name(err);
+            return err;
+        }
     } else {
         err = cuCtxSynchronize();
-    }
-    if (err != CUDA_SUCCESS) {
-        error(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_sync): "
-                            << get_error_name(err);
-        return err;
+        if (err != CUDA_SUCCESS) {
+            error(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_sync): "
+                                << get_error_name(err);
+            return err;
+        }
     }
 
 #ifdef DEBUG_RUNTIME

From 0245bacd035b082ef5dd070d08df2b956d50f28c Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Tue, 13 Apr 2021 10:18:11 -0700
Subject: [PATCH 08/11] cuda.cpp: cast error codes to CUresult for get_error_name

---
 src/runtime/cuda.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 9cbdb67421ff..12f3cd404ceb 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -686,7 +686,7 @@ WEAK int halide_cuda_device_release(void *user_context) {
             err = cuCtxSynchronize();
             if (err != CUDA_SUCCESS) {
                 debug(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_release): "
-                                    << get_error_name(err);
+                                    << get_error_name((CUresult)err) << "\n";
                 // do not return!
             }
         }
@@ -1068,7 +1068,7 @@ WEAK int halide_cuda_run(void *user_context,
     Context ctx(user_context);
     if (ctx.error != CUDA_SUCCESS) {
         error(user_context) << "CUDA: Context failed: "
-                            << get_error_name(ctx.error);
+                            << get_error_name((CUresult)ctx.error);
         return ctx.error;
     }
 

From 08c36eeb0d1ec70fe35225edd129a88e1d2cc024 Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Tue, 13 Apr 2021 10:29:45 -0700
Subject: [PATCH 09/11] gpu_dynamic_shared.cpp: force host-cuda target; debug level back to 2

---
 test/correctness/gpu_dynamic_shared.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp
index 6febc4808bc0..dc807c620643 100644
--- a/test/correctness/gpu_dynamic_shared.cpp
+++ b/test/correctness/gpu_dynamic_shared.cpp
@@ -4,11 +4,12 @@
 using namespace Halide;
 
 int main(int argc, char **argv) {
-    Target t = get_jit_target_from_environment();
-    if (!t.has_gpu_feature()) {
-        printf("[SKIP] No GPU target enabled.\n");
-        return 0;
-    }
+    Target t = Target("host-cuda");
+    // get_jit_target_from_environment();
+    // if (!t.has_gpu_feature()) {
+    //     printf("[SKIP] No GPU target enabled.\n");
+    //     return 0;
+    // }
 
     if (t.has_feature(Target::OpenGLCompute)) {
         printf("[SKIP] Skipping test for OpenGLCompute, as it does not support dynamically-sized shared memory\n");
@@ -18,7 +19,7 @@ int main(int argc, char **argv) {
     if (t.has_feature(Target::CUDA)) {
         t.set_feature(Target::Debug);
         t.set_feature(Target::DisableLLVMLoopOpt);
-        Halide::Internal::debug::set_debug_level(1);
+        Halide::Internal::debug::set_debug_level(2);
     }
 
     // Check dynamic allocations per-block and per-thread into both

From 4bb78f894e0b82cb6f473926f9cd40bfac013e32 Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Tue, 13 Apr 2021 10:50:45 -0700
Subject: [PATCH 10/11] wip: log CUDA capability and disassembler command line

---
 src/CodeGen_PTX_Dev.cpp | 3 ++-
 src/Target.cpp          | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index 3b354e51e342..c6c6cfcffa34 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -718,7 +718,7 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
     if (debug::debug_level() >= 2) {
         dump();
     }
-    debug(2) << "Done with CodeGen_PTX_Dev::compile_to_src";
+    debug(2) << "Done with CodeGen_PTX_Dev::compile_to_src\n";
 
     debug(1) << "PTX kernel:\n"
              << outstr.c_str() << "\n";
@@ -739,6 +739,7 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
         string cmd = "ptxas --gpu-name " + mcpu() + " " + ptx.pathname() + " -o " + sass.pathname();
         if (system(cmd.c_str()) == 0) {
             cmd = "nvdisasm " + sass.pathname();
+            debug(2) << "ptxas cmdline: (" << cmd << ")\n";
             int ret = system(cmd.c_str());
             (void)ret;  // Don't care if it fails
         }
diff --git a/src/Target.cpp b/src/Target.cpp
index f154a535ff1a..66c249f8526d 100644
--- a/src/Target.cpp
+++ b/src/Target.cpp
@@ -241,6 +241,7 @@ Target::Feature calculate_host_cuda_capability(Target t) {
     int err = interface->compute_capability(nullptr, &major, &minor);
     internal_assert(err == 0) << "Failed to query cuda compute capability\n";
     int ver = major * 10 + minor;
+    Internal::debug(0) << "CUDA capability is " << major << "." << minor << " -> " << ver << "\n";
     if (ver < 30) {
         return Target::FeatureEnd;
     } else if (ver < 32) {

From 0b84e1e334dca1bd6f9a369dd34f2cc1bab75010 Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Tue, 13 Apr 2021 10:53:34 -0700
Subject: [PATCH 11/11] CodeGen_PTX_Dev.cpp: log ptxas and nvdisasm commands; detach temp files

---
 src/CodeGen_PTX_Dev.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index c6c6cfcffa34..09ca05d65218 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -737,12 +737,15 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
         f.close();
 
         string cmd = "ptxas --gpu-name " + mcpu() + " " + ptx.pathname() + " -o " + sass.pathname();
+        debug(2) << "ptxas cmdline: (" << cmd << ")\n";
         if (system(cmd.c_str()) == 0) {
             cmd = "nvdisasm " + sass.pathname();
-            debug(2) << "ptxas cmdline: (" << cmd << ")\n";
+            debug(2) << "nvdisasm cmdline: (" << cmd << ")\n";
             int ret = system(cmd.c_str());
             (void)ret;  // Don't care if it fails
         }
+        ptx.detach();  // NOTE(review): debugging aid; presumably leaves the temp files on disk for inspection - confirm
+        sass.detach();
 
         // Note: It works to embed the contents of the .sass file in
         // the buffer instead of the ptx source, and this could help