From bfb58c54b3668e5821db426ed55170aab37cd43e Mon Sep 17 00:00:00 2001
From: Matt Toohey <contact@matttoohey.com>
Date: Thu, 19 Feb 2026 11:28:16 -0800
Subject: [PATCH 1/2] fix: reliably stop running actions by killing process
 groups and fixing lifecycle

Previously, clicking Stop on a running action would fail to actually stop
the underlying process and would prematurely show a 'stopped' status in the
UI while the process continued running. Four root causes were identified
and fixed:

1. Process group killing: The shell was spawned without its own process
   group, so SIGTERM only reached the shell process while child processes
   (npm, cargo, etc.) continued as orphans. Now we call setsid() via
   pre_exec to create a new session/process group, and send SIGTERM to
   the negative PID (-pgid) to kill the entire tree. A background thread
   escalates to SIGKILL after 5 seconds if the group is still alive.

2. Premature UI status: The frontend set status='stopped' immediately in
   handleStop() before the backend confirmed the process actually exited.
   Now the frontend waits for the backend's StatusChanged event to update
   the status, showing 'Stopping...' on a disabled button in the interim.

3. Running map cleanup: stop() previously removed the entry from the
   running map immediately, which meant the completion thread couldn't
   find the output buffer to move it to the completed map, losing all
   buffered output. Now stop() only sends the signal; the completion
   thread handles all cleanup.

4. Stopped vs Failed status: The completion thread only emitted Completed
   or Failed based on exit code. Killed processes exit with a signal (no
   exit code), so they were reported as Failed. Now a 'stopped' HashSet
   tracks intentional stops, and the completion thread checks it to emit
   the correct Stopped status. Auto-commit is also skipped for stopped
   actions.
---
 crates/builderbot-actions/src/executor.rs     | 145 ++++++++++++------
 .../features/actions/ActionOutputModal.svelte |  21 ++-
 2 files changed, 114 insertions(+), 52 deletions(-)
diff --git a/crates/builderbot-actions/src/executor.rs b/crates/builderbot-actions/src/executor.rs
index aaa432b8..c723e0b2 100644
--- a/crates/builderbot-actions/src/executor.rs
+++ b/crates/builderbot-actions/src/executor.rs
@@ -5,7 +5,7 @@
 
 use anyhow::{Context, Result};
 use async_trait::async_trait;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::io::{Read, Write};
 use std::process::{Command, Stdio};
 use std::sync::{Arc, Mutex};
@@ -40,6 +40,9 @@ struct RunningActionState {
 pub struct ActionExecutor {
     running: Arc<Mutex<HashMap<String, RunningActionState>>>,
     completed: Arc<Mutex<HashMap<String, Vec<OutputChunk>>>>,
+    /// Tracks execution IDs that have been explicitly stopped by the user.
+    /// The completion thread checks this to emit `Stopped` instead of `Failed`.
+    stopped: Arc<Mutex<HashSet<String>>>,
 }
 
 impl Default for ActionExecutor {
@@ -54,6 +57,7 @@ impl ActionExecutor {
         Self {
             running: Arc::new(Mutex::new(HashMap::new())),
             completed: Arc::new(Mutex::new(HashMap::new())),
+            stopped: Arc::new(Mutex::new(HashSet::new())),
         }
     }
 
@@ -82,8 +86,8 @@ impl ActionExecutor {
         // 2. Login shell loads the full environment
         // 3. -s flag forces shell to read commands from stdin (critical for non-TTY context)
         // 4. Stdin commands execute AFTER shell initialization and hook activation
-        let mut child = Command::new(&shell)
-            .current_dir(&working_dir) // Start in target directory to trigger directory hooks
+        let mut cmd = Command::new(&shell);
+        cmd.current_dir(&working_dir) // Start in target directory to trigger directory hooks
             .env_clear() // Clear all inherited environment variables
             .env("HOME", std::env::var("HOME").unwrap_or_default()) // Preserve HOME for shell profile loading
             .env("USER", std::env::var("USER").unwrap_or_default()) // Preserve USER for shell profile loading
@@ -93,9 +97,27 @@ impl ActionExecutor {
             .arg("-s") // Force shell to read commands from stdin (required for non-TTY)
             .stdin(Stdio::piped()) // Pipe stdin to send commands after initialization
             .stdout(Stdio::piped())
-            .stderr(Stdio::piped())
-            .spawn()
-            .context("Failed to spawn action process")?;
+            .stderr(Stdio::piped());
+
+        // Create a new process group so we can kill the shell AND all its children.
+        // Without this, SIGTERM only reaches the shell process, leaving child
+        // processes (e.g. `npm run build`, `cargo build`) running as orphans.
+        #[cfg(unix)]
+        {
+            use std::os::unix::process::CommandExt;
+            // SAFETY: `setsid()` is an async-signal-safe function that creates a new
+            // session and process group. It is safe to call in a pre_exec hook.
+            unsafe {
+                cmd.pre_exec(|| {
+                    // Create a new session (and process group) for this child.
+                    // All processes spawned by the shell will inherit this group.
+                    libc::setsid();
+                    Ok(())
+                });
+            }
+        }
+
+        let mut child = cmd.spawn().context("Failed to spawn action process")?;
 
         let child_pid = child.id();
 
@@ -243,6 +265,7 @@ impl ActionExecutor {
         let exec_id = execution_id.clone();
         let running_clone = self.running.clone();
         let completed_clone = self.completed.clone();
+        let stopped_clone = self.stopped.clone();
         let working_dir_clone = working_dir.clone();
         let auto_commit = metadata.auto_commit;
         let action_name = metadata.action_name.clone();
@@ -253,6 +276,12 @@ impl ActionExecutor {
             let exit_code = exit_status.as_ref().ok().and_then(|s| s.code());
             let completed_at = now_timestamp();
 
+            // Check whether this execution was explicitly stopped by the user
+            let was_stopped = {
+                let mut stopped = stopped_clone.lock().unwrap();
+                stopped.remove(&exec_id)
+            };
+
             // Move output buffer to completed actions map and remove from running
             {
                 let mut running = running_clone.lock().unwrap();
@@ -265,8 +294,13 @@ impl ActionExecutor {
 
             let success = exit_status.as_ref().map(|s| s.success()).unwrap_or(false);
 
-            // Emit completion status
-            let status = if success {
+            // Determine the correct status:
+            // - If explicitly stopped by the user → Stopped
+            // - If exited successfully → Completed
+            // - Otherwise → Failed
+            let status = if was_stopped {
+                ActionStatus::Stopped
+            } else if success {
                 ActionStatus::Completed
             } else {
                 ActionStatus::Failed
@@ -286,7 +320,7 @@ impl ActionExecutor {
                         .await;
 
                     // If auto_commit is enabled and action succeeded, commit changes
-                    if auto_commit && success {
+                    if auto_commit && success && !was_stopped {
                         if let Err(e) = auto_commit_if_changes(&working_dir_clone, &action_name) {
                             eprintln!("Failed to auto-commit changes: {}", e);
                         } else {
@@ -346,49 +380,68 @@ impl ActionExecutor {
         Ok(execution_id)
     }
 
-    /// Stop a running action by execution ID
+    /// Stop a running action by execution ID.
+    ///
+    /// Marks the execution as stopped and sends SIGTERM to the entire process
+    /// group. The completion thread will see the stopped flag and emit a
+    /// `Stopped` status event once the process actually exits.
     pub fn stop(&self, execution_id: &str) -> Result<()> {
-        let state = {
-            let mut running = self.running.lock().unwrap();
-            running.remove(execution_id)
+        // Mark as stopped BEFORE sending the signal so the completion thread
+        // knows this was an intentional stop (not a crash/failure).
+        {
+            let mut stopped = self.stopped.lock().unwrap();
+            stopped.insert(execution_id.to_string());
+        }
+
+        // Read the PID but do NOT remove from the running map. The completion
+        // thread needs the entry to move the output buffer to the completed map
+        // and to emit the proper StatusChanged event.
+        let pid = {
+            let running = self.running.lock().unwrap();
+            running.get(execution_id).and_then(|state| state.child_pid)
         };
 
-        if let Some(state) = state {
-            if let Some(pid) = state.child_pid {
-                #[cfg(unix)]
-                {
-                    // SAFETY: This unsafe block calls libc::kill to terminate a child process.
-                    //
-                    // Safety considerations:
-                    // 1. PID validity: The PID comes from `std::process::Child::id()` which was
-                    //    stored when we spawned the process. While the process may have already
-                    //    terminated, calling kill() on a non-existent PID is safe (returns ESRCH).
-                    // 2. PID reuse: On Unix systems, PIDs can be reused after process termination.
-                    //    However, the window for reuse is typically small, and we only call this
-                    //    immediately after removing the process from our tracking map. The risk of
-                    //    terminating an unrelated process is minimal in practice.
-                    // 3. Signal delivery: SIGTERM is a graceful termination signal that allows
-                    //    processes to clean up. This is safer than SIGKILL.
-                    // 4. Error handling: We intentionally ignore the return value because:
-                    //    - If the process already exited, kill() fails with ESRCH (acceptable)
-                    //    - If we lack permissions, we can't do anything about it
-                    //    - The process is already removed from our tracking, so we've done our part
-                    //
-                    // Alternative considered: Using a higher-level library like `sysinfo` or maintaining
-                    // a handle to the Child. However, this adds complexity and dependencies without
-                    // significantly improving safety for this use case.
+        if let Some(pid) = pid {
+            #[cfg(unix)]
+            {
+                // Send SIGTERM to the entire process group (negative PID).
+                //
+                // Because we used `setsid()` in pre_exec, the shell and all its
+                // children share a process group whose PGID equals the shell's PID.
+                // Sending the signal to `-pid` reaches every process in the group,
+                // ensuring child processes (npm, cargo, etc.) are also terminated.
+                //
+                // SAFETY: Calling `libc::kill` with a negative PID targets a process
+                // group. The PID came from `Child::id()` at spawn time. If the group
+                // no longer exists, kill() returns ESRCH which we safely ignore.
+                // After SIGTERM, we spawn a background thread that waits briefly and
+                // escalates to SIGKILL if the group is still alive.
+                unsafe {
+                    libc::kill(-(pid as i32), libc::SIGTERM);
+                }
+
+                // Escalate to SIGKILL after a short grace period in case the
+                // process group ignores SIGTERM (e.g. a process traps the signal).
+                thread::spawn(move || {
+                    thread::sleep(std::time::Duration::from_secs(5));
+                    // SAFETY: Same considerations as above. If the process group
+                    // already exited, kill() harmlessly returns ESRCH.
                     unsafe {
-                        libc::kill(pid as i32, libc::SIGTERM);
+                        // Check if the process group still exists before sending SIGKILL
+                        let ret = libc::kill(-(pid as i32), 0);
+                        if ret == 0 {
+                            libc::kill(-(pid as i32), libc::SIGKILL);
+                        }
                     }
-                }
+                });
+            }
 
-                #[cfg(windows)]
-                {
-                    // Use taskkill on Windows for graceful termination
-                    let _ = Command::new("taskkill")
-                        .args(["/PID", &pid.to_string(), "/F"])
-                        .status();
-                }
+            #[cfg(windows)]
+            {
+                // Use taskkill with /T to kill the process tree on Windows
+                let _ = Command::new("taskkill")
+                    .args(["/PID", &pid.to_string(), "/T", "/F"])
+                    .status();
             }
         }
 
diff --git a/staged/src/lib/features/actions/ActionOutputModal.svelte b/staged/src/lib/features/actions/ActionOutputModal.svelte
index aff20c49..51385804 100644
--- a/staged/src/lib/features/actions/ActionOutputModal.svelte
+++ b/staged/src/lib/features/actions/ActionOutputModal.svelte
@@ -193,6 +193,10 @@
           if (event.exitCode !== undefined) {
             exitCode = event.exitCode;
           }
+          // Reset stopping flag once the backend confirms a terminal state
+          if (status !== 'running') {
+            stopping = false;
+          }
         }
       });
     } catch (e: any) {
@@ -220,11 +224,11 @@
     stopping = true;
     try {
       await stopBranchAction(executionId);
-      status = 'stopped';
+      // Don't set status here — the backend will emit a 'stopped' status
+      // event once the process actually exits, which our listener handles.
     } catch (e: any) {
       error = e?.message || 'Failed to stop action';
       console.error('Failed to stop action:', e);
-    } finally {
       stopping = false;
     }
   }
@@ -342,10 +346,10 @@
         {/if}
       </div>
       <div class="header-actions">
-        {#if isRunning && !stopping}
-          <button class="stop-btn" onclick={handleStop} title="Stop action">
+        {#if isRunning}
+          <button class="stop-btn" onclick={handleStop} disabled={stopping} title="Stop action">
             <CircleStop size={14} />
-            <span>Stop</span>
+            <span>{stopping ? 'Stopping…' : 'Stop'}</span>
           </button>
         {/if}
         {#if status === 'failed' && onRemove}
@@ -497,11 +501,16 @@
     transition: all 0.15s;
   }
 
-  .stop-btn:hover {
+  .stop-btn:hover:not(:disabled) {
     background: rgba(239, 68, 68, 0.15);
     border-color: rgba(239, 68, 68, 0.3);
   }
 
+  .stop-btn:disabled {
+    opacity: 0.6;
+    cursor: not-allowed;
+  }
+
   .remove-btn {
     display: flex;
     align-items: center;

From 39a7780f126f9ac8cd446346a384dcda01db4473 Mon Sep 17 00:00:00 2001
From: Matt Toohey <contact@matttoohey.com>
Date: Thu, 19 Feb 2026 11:42:23 -0800
Subject: [PATCH 2/2] fix: allow rerunning actions after stop by cleaning up
 stale execution state

Previously, after stopping a running action, clicking Run again would show
old output instead of starting a fresh execution. Three issues were fixed:

1. handleRunAction() matched any execution for the action regardless of
   status. Now it only treats status='running' as already-running; for
   stopped/failed/completed entries it clears the stale state and starts
   a new run.

2. Stopped and failed actions were never auto-removed from runningActions
   (only completed ones were). Now all terminal states (completed, failed,
   stopped) are auto-removed after a brief display delay.

3. When rerunning, old backend output buffers are cleaned up via
   clearActionExecution() before the new run starts, preventing stale
   output from appearing.

4. The primary action button only opens the output modal when the action
   is actively running; otherwise it triggers handleRunAction to start
   a fresh execution.
---
 .../lib/features/branches/BranchCard.svelte   | 67 ++++++++++++-------
 1 file changed, 42 insertions(+), 25 deletions(-)

diff --git a/staged/src/lib/features/branches/BranchCard.svelte b/staged/src/lib/features/branches/BranchCard.svelte
index 47ffde6c..166d3b96 100644
--- a/staged/src/lib/features/branches/BranchCard.svelte
+++ b/staged/src/lib/features/branches/BranchCard.svelte
@@ -60,6 +60,7 @@
   import {
     runBranchAction,
     getRunningBranchActions,
+    clearActionExecution,
     type ActionStatusEvent,
   } from '../actions/actions';
   import { getAvailableOpeners, openInApp, copyPathToClipboard, type OpenerApp } from './branch';
@@ -310,33 +311,35 @@
           runningActions[existingIndex].exitCode = payload.exitCode;
           runningActions[existingIndex].completedAt = payload.completedAt;
 
-          // Auto-remove successful completions (with fade for secondary, instant for primary)
+          // Auto-remove terminal states after a delay
+          const action = runningActions[existingIndex];
+          const isPrimaryAction = primaryRunAction && action.actionId === primaryRunAction.id;
+
+          // Determine delay based on status: completed shows briefly, stopped/failed show longer
+          let displayTime: number;
           if (payload.status === 'completed') {
-            const action = runningActions[existingIndex];
-            const isPrimaryAction = primaryRunAction && action.actionId === primaryRunAction.id;
+            displayTime = isPrimaryAction ? 1000 : 2000;
+          } else {
+            // stopped/failed: show status briefly then clean up so rerun works cleanly
+            displayTime = isPrimaryAction ? 2000 : 3000;
+          }
 
+          setTimeout(() => {
+            const foundAction = runningActions.find((a) => a.executionId === payload.executionId);
+            if (foundAction && !isPrimaryAction) {
+              // Secondary actions fade out
+              foundAction.fading = true;
+            }
+            // Remove after animation completes (or immediately for primary)
             setTimeout(
               () => {
-                const foundAction = runningActions.find(
-                  (a) => a.executionId === payload.executionId
+                runningActions = runningActions.filter(
+                  (a) => a.executionId !== payload.executionId
                 );
-                if (foundAction && !isPrimaryAction) {
-                  // Secondary actions fade out
-                  foundAction.fading = true;
-                }
-                // Remove after animation completes (or immediately for primary)
-                setTimeout(
-                  () => {
-                    runningActions = runningActions.filter(
-                      (a) => a.executionId !== payload.executionId
-                    );
-                  },
-                  isPrimaryAction ? 0 : 300
-                ); // Match CSS transition duration for secondary
               },
-              isPrimaryAction ? 1000 : 2000
-            ); // Shorter display time for primary action
-          }
+              isPrimaryAction ? 0 : 300
+            ); // Match CSS transition duration for secondary
+          }, displayTime);
         }
       }
     }).then((unlisten) => {
@@ -680,11 +683,13 @@
   async function handleRunAction(action: ProjectAction) {
     showMoreMenu = false;
 
-    // Check if this action is already running
-    const existingExecution = runningActions.find((a) => a.actionId === action.id);
+    // Check if this action is currently running
+    const existingExecution = runningActions.find(
+      (a) => a.actionId === action.id && a.status === 'running'
+    );
 
     if (existingExecution) {
-      // Action already running, open modal to view output
+      // Action is actively running, open modal to view output
       actionOutputModal = {
         executionId: existingExecution.executionId,
         actionName: action.name,
@@ -692,6 +697,18 @@
       return;
     }
 
+    // Remove any stale (stopped/failed/completed) entries for this action
+    // before starting a new run, and clean up their backend buffers
+    const staleExecutions = runningActions.filter(
+      (a) => a.actionId === action.id && a.status !== 'running'
+    );
+    for (const stale of staleExecutions) {
+      clearActionExecution(stale.executionId).catch(() => {});
+    }
+    runningActions = runningActions.filter(
+      (a) => !(a.actionId === action.id && a.status !== 'running')
+    );
+
     // Start the action silently (don't open modal)
     try {
       await runBranchAction(branch.id, action.id);
@@ -1367,7 +1384,7 @@
               class:completed={primaryActionExecution?.status === 'completed'}
               class:failed={primaryActionExecution?.status === 'failed'}
               onclick={() =>
-                primaryActionExecution
+                primaryActionExecution?.status === 'running'
                   ? handleShowActionOutput(primaryActionExecution)
                   : handleRunAction(primaryRunAction)}
               title={primaryRunAction.name}