diff --git a/csrc/device_lower/pass/alias_memory.cpp b/csrc/device_lower/pass/alias_memory.cpp index 130492820f5..2865b457311 100644 --- a/csrc/device_lower/pass/alias_memory.cpp +++ b/csrc/device_lower/pass/alias_memory.cpp @@ -769,7 +769,13 @@ class AllocationInfoMap : private kir::IrVisitor { void handle(kir::ForLoop* for_loop) final { auto loop_info = scope_map_.getLoopScopeInfo(for_loop); - current_stack_.push_back(loop_info); + if (!for_loop->isTrivial()) { + // Parallelized loops do not result in for loops in the CUDA kernel, so + // they should not affect liveness analysis. This means that + // current_stack_ will differ from kir::IrVisitor::for_loops_, which will + // actually hold all ForLoops regardless of parallelization. + current_stack_.push_back(loop_info); + } if (debug_printer_) { debug_printer_->pushScope(); } @@ -777,7 +783,9 @@ class AllocationInfoMap : private kir::IrVisitor { if (debug_printer_) { debug_printer_->popScope(); } - current_stack_.pop_back(); + if (!for_loop->isTrivial()) { + current_stack_.pop_back(); + } } void handle(kir::IfThenElse* ite) final {