diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index 3b354e51e342..09ca05d65218 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -718,7 +718,7 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
     if (debug::debug_level() >= 2) {
         dump();
     }
-    debug(2) << "Done with CodeGen_PTX_Dev::compile_to_src";
+    debug(2) << "Done with CodeGen_PTX_Dev::compile_to_src\n";
 
     debug(1) << "PTX kernel:\n"
              << outstr.c_str() << "\n";
@@ -737,11 +737,15 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
         f.close();
 
         string cmd = "ptxas --gpu-name " + mcpu() + " " + ptx.pathname() + " -o " + sass.pathname();
+        debug(2) << "ptxas cmdline: (" << cmd << ")\n";
         if (system(cmd.c_str()) == 0) {
             cmd = "nvdisasm " + sass.pathname();
+            debug(2) << "nvdisasm cmdline: (" << cmd << ")\n";
             int ret = system(cmd.c_str());
             (void)ret;  // Don't care if it fails
         }
+ptx.detach();
+sass.detach();
 
         // Note: It works to embed the contents of the .sass file in
         // the buffer instead of the ptx source, and this could help
diff --git a/src/Debug.cpp b/src/Debug.cpp
index 22d26e83cc31..11b2c3358fb6 100644
--- a/src/Debug.cpp
+++ b/src/Debug.cpp
@@ -4,13 +4,24 @@
 namespace Halide {
 namespace Internal {
 
+namespace {
+
+int cached_debug_level = ([]() -> int {
+    std::string lvl = get_env_variable("HL_DEBUG_CODEGEN");
+    return !lvl.empty() ? atoi(lvl.c_str()) : 0;
+})();
+
+}  // namespace
+
 int debug::debug_level() {
-    static int cached_debug_level = ([]() -> int {
-        std::string lvl = get_env_variable("HL_DEBUG_CODEGEN");
-        return !lvl.empty() ? atoi(lvl.c_str()) : 0;
-    })();
     return cached_debug_level;
 }
 
+int debug::set_debug_level(int d) {
+    int old_level = cached_debug_level;
+    cached_debug_level = d;
+    return old_level;
+}
+
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/Debug.h b/src/Debug.h
index fadb5b4066ac..9f005b1f0805 100644
--- a/src/Debug.h
+++ b/src/Debug.h
@@ -63,6 +63,7 @@ class debug {
     }
 
     static int debug_level();
+    static int set_debug_level(int d);
 };
 
 }  // namespace Internal
diff --git a/src/Pipeline.cpp b/src/Pipeline.cpp
index f552cee81259..deac3bea7692 100644
--- a/src/Pipeline.cpp
+++ b/src/Pipeline.cpp
@@ -882,7 +882,7 @@ struct JITFuncCallContext {
                                    std::to_string(exit_status) + " but halide_error was never called.\n");
             }
-            halide_runtime_error << output;
+            halide_runtime_error << "ZZZ(" << output << ")ZZZ";
             error_buffer.end = 0;
         }
     }
 
diff --git a/src/Target.cpp b/src/Target.cpp
index f154a535ff1a..66c249f8526d 100644
--- a/src/Target.cpp
+++ b/src/Target.cpp
@@ -241,6 +241,7 @@ Target::Feature calculate_host_cuda_capability(Target t) {
     int err = interface->compute_capability(nullptr, &major, &minor);
     internal_assert(err == 0) << "Failed to query cuda compute capability\n";
     int ver = major * 10 + minor;
+    Internal::debug(0) <<"CUDA capability is " << major << "." << minor << " -> " << ver << "\n";
     if (ver < 30) {
         return Target::FeatureEnd;
     } else if (ver < 32) {
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 0846bd9b5c40..12f3cd404ceb 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -684,6 +684,11 @@ WEAK int halide_cuda_device_release(void *user_context) {
         err = cuCtxPushCurrent(ctx);
         if (err != CUDA_SUCCESS) {
             err = cuCtxSynchronize();
+            if (err != CUDA_SUCCESS) {
+                debug(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_release): "
+                                    << get_error_name((CUresult)err);
+                // do not return!
+            }
         }
         halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
 
@@ -1020,13 +1025,18 @@ WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) {
             error(user_context) << "CUDA: In halide_cuda_device_sync, halide_cuda_get_stream returned " << result << "\n";
         }
         err = cuStreamSynchronize(stream);
+        if (err != CUDA_SUCCESS) {
+            error(user_context) << "CUDA: cuStreamSynchronize failed (halide_cuda_device_sync): "
+                                << get_error_name(err);
+            return err;
+        }
     } else {
         err = cuCtxSynchronize();
-    }
-    if (err != CUDA_SUCCESS) {
-        error(user_context) << "CUDA: cuCtxSynchronize failed: "
-                            << get_error_name(err);
-        return err;
+        if (err != CUDA_SUCCESS) {
+            error(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_device_sync): "
+                                << get_error_name(err);
+            return err;
+        }
     }
 
 #ifdef DEBUG_RUNTIME
@@ -1057,6 +1067,8 @@ WEAK int halide_cuda_run(void *user_context,
     CUresult err;
     Context ctx(user_context);
     if (ctx.error != CUDA_SUCCESS) {
+        error(user_context) << "CUDA: Context failed: "
+                            << get_error_name((CUresult)ctx.error);
         return ctx.error;
     }
 
@@ -1136,13 +1148,15 @@ WEAK int halide_cuda_run(void *user_context,
 #ifdef DEBUG_RUNTIME
     err = cuCtxSynchronize();
     if (err != CUDA_SUCCESS) {
-        error(user_context) << "CUDA: cuCtxSynchronize failed: "
+        error(user_context) << "CUDA: cuCtxSynchronize failed (halide_cuda_run): "
                             << get_error_name(err);
         return err;
     }
 
     uint64_t t_after = halide_current_time_ns(user_context);
     debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
 #endif
+
+    debug(user_context) << "CUDA: halide_cuda_run succeeds!\n";
     return 0;
 }
diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp
index d43386f05980..dc807c620643 100644
--- a/test/correctness/gpu_dynamic_shared.cpp
+++ b/test/correctness/gpu_dynamic_shared.cpp
@@ -4,21 +4,53 @@
 using namespace Halide;
 
 int main(int argc, char **argv) {
-    Target t = get_jit_target_from_environment();
-    if (!t.has_gpu_feature()) {
-        printf("[SKIP] No GPU target enabled.\n");
-        return 0;
-    }
+    Target t = Target("host-cuda");
+    // get_jit_target_from_environment();
+    // if (!t.has_gpu_feature()) {
+    //     printf("[SKIP] No GPU target enabled.\n");
+    //     return 0;
+    // }
 
     if (t.has_feature(Target::OpenGLCompute)) {
         printf("[SKIP] Skipping test for OpenGLCompute, as it does not support dynamically-sized shared memory\n");
         return 0;
     }
 
+    if (t.has_feature(Target::CUDA)) {
+        t.set_feature(Target::Debug);
+        t.set_feature(Target::DisableLLVMLoopOpt);
+        Halide::Internal::debug::set_debug_level(2);
+    }
+
     // Check dynamic allocations per-block and per-thread into both
     // shared and global
+#if 1
+    Func f("f"), g("g");
+    Var x("x"), xi("xi");
+
+    f(x) = x;
+    g(x) = f(x) + f(2 * x);
+
+    g.gpu_tile(x, xi, 16);
+    f.compute_at(g, xi);
+
+    f.store_in(MemoryType::GPUShared);
+
+    // The amount of shared/heap memory required varies with x
+    Buffer<int> out = g.realize({100}, t);
+    for (int x = 0; x < 100; x++) {
+        int correct = 3 * x;
+        if (out(x) != correct) {
+            printf("out(%d) = %d instead of %d\n",
+                   x, out(x), correct);
+            return -1;
+        }
+    }
+#else
     for (int per_thread = 0; per_thread < 2; per_thread++) {
         for (auto memory_type : {MemoryType::GPUShared, MemoryType::Heap}) {
+            printf("Testing: per_thread=%d, memory_type=%d\n", per_thread, (int)memory_type);
+
             Func f("f"), g("g");
             Var x("x"), xi("xi");
 
@@ -35,7 +67,7 @@ int main(int argc, char **argv) {
             f.store_in(memory_type);
 
             // The amount of shared/heap memory required varies with x
-            Buffer<int> out = g.realize({100});
+            Buffer<int> out = g.realize({100}, t);
             for (int x = 0; x < 100; x++) {
                 int correct = 3 * x;
                 if (out(x) != correct) {
@@ -46,6 +78,7 @@ int main(int argc, char **argv) {
             }
         }
     }
+#endif
 
     printf("Success!\n");
     return 0;