From 1de875dae1ba0cf9bd0d9386768b9eeab53eff4c Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Sun, 21 Dec 2025 08:41:01 +0200 Subject: [PATCH 1/3] fix: Improve subprocess shutdown with proper timeout handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #51 Changes: - Add timeout constants: mcpClientCloseTimeout (10s), processGracefulTimeout (9s) - Fix order: try graceful MCP client close FIRST, then force kill if needed - Fix mutex: release before long I/O operations, reacquire for state updates - Fix killProcessGroup: poll every 100ms instead of fixed 10s sleep - Add early exit when process group is already dead The 9s SIGTERMβ†’SIGKILL timeout fits within the 10s MCP close timeout, ensuring force kill completes before we give up on graceful close. πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- internal/upstream/core/connection.go | 156 +++++++++++------------- internal/upstream/core/process_unix.go | 72 +++++++---- internal/upstream/core/shutdown_test.go | 68 +++++++++++ 3 files changed, 187 insertions(+), 109 deletions(-) create mode 100644 internal/upstream/core/shutdown_test.go diff --git a/internal/upstream/core/connection.go b/internal/upstream/core/connection.go index 5302aa59..5337e621 100644 --- a/internal/upstream/core/connection.go +++ b/internal/upstream/core/connection.go @@ -28,6 +28,15 @@ const ( dockerCleanupTimeout = 30 * time.Second + // Subprocess shutdown timeouts + // mcpClientCloseTimeout is the max time to wait for graceful MCP client close + mcpClientCloseTimeout = 10 * time.Second + // processGracefulTimeout is the max time to wait after SIGTERM before SIGKILL + // Must be less than mcpClientCloseTimeout to complete within the close timeout + processGracefulTimeout = 9 * time.Second + // processTerminationPollInterval is how often to check if process exited + processTerminationPollInterval = 100 * time.Millisecond + // Transport types transportHTTP = "http" transportHTTPStreamable = "streamable-http" @@ -1962,137 +1971,116 @@ func (c *Client) Disconnect() error { // DisconnectWithContext closes the connection with context timeout func (c *Client) DisconnectWithContext(_ context.Context) error { + // Step 1: Read state under lock, then release for I/O operations c.mu.Lock() - defer c.mu.Unlock() - - // CRITICAL FIX: Always perform cleanup, even if not fully connected - // Servers in "Connecting" state may have Docker containers running wasConnected := c.connected + mcpClient := c.client + isDocker := c.isDockerCommand + containerID := c.containerID + containerName := c.containerName + pgid := c.processGroupID + processCmd := c.processCmd + serverName := c.config.Name + c.mu.Unlock() c.logger.Info("Disconnecting from upstream MCP server", zap.Bool("was_connected", wasConnected)) - // Log disconnection to server-specific log if c.upstreamLogger != nil { c.upstreamLogger.Info("Disconnecting from server", zap.Bool("was_connected", wasConnected)) } - // Stop stderr monitoring before closing client + // Step 2: Stop monitoring (these have their own locks) c.StopStderrMonitoring() - - // Stop process monitoring before closing client c.StopProcessMonitoring() - // For Docker containers, kill the container before closing the client - // IMPORTANT: Do this even if not fully connected, as containers may be running - if c.isDockerCommand { + // Step 3: For Docker containers, use Docker-specific cleanup + if isDocker { c.logger.Debug("Disconnecting Docker command, 
attempting container cleanup", - zap.String("server", c.config.Name), - zap.Bool("has_container_id", c.containerID != "")) + zap.String("server", serverName), + zap.Bool("has_container_id", containerID != "")) - // Create a fresh context for Docker cleanup with its own timeout - // This ensures cleanup can complete even if the main context expires cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), dockerCleanupTimeout) defer cleanupCancel() - if c.containerID != "" { + if containerID != "" { c.logger.Debug("Cleaning up Docker container by ID", - zap.String("server", c.config.Name), - zap.String("container_id", c.containerID)) + zap.String("server", serverName), + zap.String("container_id", containerID)) c.killDockerContainerWithContext(cleanupCtx) - c.logger.Debug("Docker container cleanup by ID completed", - zap.String("server", c.config.Name)) - } else if c.containerName != "" { + } else if containerName != "" { c.logger.Debug("Cleaning up Docker container by name", - zap.String("server", c.config.Name), - zap.String("container_name", c.containerName)) - c.killDockerContainerByNameWithContext(cleanupCtx, c.containerName) - c.logger.Debug("Docker container cleanup by name completed", - zap.String("server", c.config.Name)) + zap.String("server", serverName), + zap.String("container_name", containerName)) + c.killDockerContainerByNameWithContext(cleanupCtx, containerName) } else { - c.logger.Debug("No container ID or name available, using pattern-based cleanup method", - zap.String("server", c.config.Name)) - // Fallback: try to find and kill any containers started by this command + c.logger.Debug("No container ID or name, using pattern-based cleanup", + zap.String("server", serverName)) c.killDockerContainerByCommandWithContext(cleanupCtx) - c.logger.Debug("Docker fallback cleanup completed", - zap.String("server", c.config.Name)) } - } else { - c.logger.Debug("Non-Docker command disconnecting, no container cleanup needed", - zap.String("server", c.config.Name)) } - // CRITICAL FIX: Clean up process group to prevent zombie processes - if c.processGroupID > 0 { - c.logger.Info("Cleaning up process group to prevent zombie processes", - zap.String("server", c.config.Name), - zap.Int("pgid", c.processGroupID)) - - if err := killProcessGroup(c.processGroupID, c.logger, c.config.Name); err != nil { - c.logger.Error("Failed to clean up process group", - zap.String("server", c.config.Name), - zap.Int("pgid", c.processGroupID), - zap.Error(err)) - } else { - c.logger.Info("Process group cleanup completed successfully", - zap.String("server", c.config.Name), - zap.Int("pgid", c.processGroupID)) - } - - // Reset process group ID after cleanup - c.processGroupID = 0 - } - - // Close MCP client if it exists - if c.client != nil { - c.logger.Debug("Closing MCP client connection", - zap.String("server", c.config.Name)) + // Step 4: Try graceful close via MCP client FIRST + // This gives the subprocess a chance to exit cleanly via stdin/stdout close + gracefulCloseSucceeded := false + if mcpClient != nil { + c.logger.Debug("Attempting graceful MCP client close", + zap.String("server", serverName)) closeDone := make(chan struct{}) go func() { - c.client.Close() + mcpClient.Close() close(closeDone) }() select { case <-closeDone: - c.logger.Debug("MCP client connection closed", - zap.String("server", c.config.Name)) - case <-time.After(2 * time.Second): - c.logger.Warn("MCP client close timed out, forcing shutdown", - zap.String("server", c.config.Name)) + c.logger.Debug("MCP client 
closed gracefully", + zap.String("server", serverName)) + gracefulCloseSucceeded = true + case <-time.After(mcpClientCloseTimeout): + c.logger.Warn("MCP client close timed out", + zap.String("server", serverName), + zap.Duration("timeout", mcpClientCloseTimeout)) + } + } - // Attempt process group cleanup if available - if c.processGroupID > 0 { - _ = killProcessGroup(c.processGroupID, c.logger, c.config.Name) - } + // Step 5: Force kill process group only if graceful close failed + // For non-Docker stdio processes that didn't exit gracefully + if !gracefulCloseSucceeded && !isDocker && pgid > 0 { + c.logger.Info("Graceful close failed, force killing process group", + zap.String("server", serverName), + zap.Int("pgid", pgid)) - if c.processCmd != nil && c.processCmd.Process != nil { - if err := c.processCmd.Process.Kill(); err != nil { - c.logger.Warn("Failed to kill stdio process after close timeout", - zap.String("server", c.config.Name), - zap.Error(err)) - } else { - c.logger.Info("Killed stdio process after close timeout", - zap.String("server", c.config.Name)) - } + if err := killProcessGroup(pgid, c.logger, serverName); err != nil { + c.logger.Error("Failed to kill process group", + zap.String("server", serverName), + zap.Int("pgid", pgid), + zap.Error(err)) + } + + // Also try direct process kill as last resort + if processCmd != nil && processCmd.Process != nil { + if err := processCmd.Process.Kill(); err != nil { + c.logger.Debug("Direct process kill failed (may already be dead)", + zap.String("server", serverName), + zap.Error(err)) } } - } else { - c.logger.Debug("No MCP client to close (may be in connecting state)", - zap.String("server", c.config.Name)) } + // Step 6: Update state under lock + c.mu.Lock() c.client = nil c.serverInfo = nil c.connected = false - - // Clear cached tools on disconnect c.cachedTools = nil + c.processGroupID = 0 + c.mu.Unlock() c.logger.Debug("Disconnect completed successfully", - zap.String("server", c.config.Name)) + zap.String("server", serverName)) return nil } diff --git a/internal/upstream/core/process_unix.go b/internal/upstream/core/process_unix.go index 5efa3de4..c115859b 100644 --- a/internal/upstream/core/process_unix.go +++ b/internal/upstream/core/process_unix.go @@ -67,46 +67,68 @@ func killProcessGroup(pgid int, logger *zap.Logger, serverName string) error { zap.String("server", serverName), zap.Int("pgid", pgid)) - // Step 1: Send SIGTERM to the entire process group + // Step 1: Check if process group is already dead + if err := syscall.Kill(-pgid, 0); err != nil { + logger.Debug("Process group already terminated", + zap.String("server", serverName), + zap.Int("pgid", pgid)) + return nil + } + + // Step 2: Send SIGTERM to the entire process group err := syscall.Kill(-pgid, syscall.SIGTERM) if err != nil { logger.Warn("Failed to send SIGTERM to process group", zap.String("server", serverName), zap.Int("pgid", pgid), zap.Error(err)) - } else { - logger.Debug("SIGTERM sent to process group", - zap.String("server", serverName), - zap.Int("pgid", pgid)) + // Process might have exited between check and kill, that's ok + return nil } - // Step 2: Wait a bit for graceful termination - time.Sleep(2 * time.Second) - - // Step 3: Check if processes are still running and send SIGKILL if needed - if err := syscall.Kill(-pgid, 0); err == nil { - // Processes still exist, force kill them - logger.Warn("Process group still running after SIGTERM, sending SIGKILL", - zap.String("server", serverName), - zap.Int("pgid", pgid)) + logger.Debug("SIGTERM 
sent to process group", + zap.String("server", serverName), + zap.Int("pgid", pgid)) - if killErr := syscall.Kill(-pgid, syscall.SIGKILL); killErr != nil { - logger.Error("Failed to send SIGKILL to process group", + // Step 3: Poll for graceful termination (up to processGracefulTimeout) + // This allows fast exit if process terminates quickly + deadline := time.Now().Add(processGracefulTimeout) + for time.Now().Before(deadline) { + if err := syscall.Kill(-pgid, 0); err != nil { + // Process group is dead + logger.Info("Process group terminated gracefully after SIGTERM", zap.String("server", serverName), - zap.Int("pgid", pgid), - zap.Error(killErr)) - return killErr + zap.Int("pgid", pgid)) + return nil } + time.Sleep(processTerminationPollInterval) + } - logger.Info("SIGKILL sent to process group", - zap.String("server", serverName), - zap.Int("pgid", pgid)) - } else { - logger.Info("Process group terminated successfully", + // Step 4: Process still running, send SIGKILL + logger.Warn("Process group still running after SIGTERM timeout, sending SIGKILL", + zap.String("server", serverName), + zap.Int("pgid", pgid), + zap.Duration("timeout", processGracefulTimeout)) + + if killErr := syscall.Kill(-pgid, syscall.SIGKILL); killErr != nil { + // Check if it died between our check and kill + if syscall.Kill(-pgid, 0) != nil { + logger.Debug("Process group exited just before SIGKILL", + zap.String("server", serverName), + zap.Int("pgid", pgid)) + return nil + } + logger.Error("Failed to send SIGKILL to process group", zap.String("server", serverName), - zap.Int("pgid", pgid)) + zap.Int("pgid", pgid), + zap.Error(killErr)) + return killErr } + logger.Info("SIGKILL sent to process group", + zap.String("server", serverName), + zap.Int("pgid", pgid)) + return nil } diff --git a/internal/upstream/core/shutdown_test.go b/internal/upstream/core/shutdown_test.go new file mode 100644 index 00000000..f08e6cf6 --- /dev/null +++ b/internal/upstream/core/shutdown_test.go @@ -0,0 +1,68 @@ +//go:build unix + +package core + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "go.uber.org/zap" +) + +func TestShutdownTimeoutConstants(t *testing.T) { + // Verify timeout relationship: processGracefulTimeout < mcpClientCloseTimeout + // This ensures killProcessGroup can complete within the MCP close timeout + assert.Less(t, processGracefulTimeout, mcpClientCloseTimeout, + "processGracefulTimeout (%v) must be less than mcpClientCloseTimeout (%v)", + processGracefulTimeout, mcpClientCloseTimeout) + + // Verify reasonable values + assert.Equal(t, 10*time.Second, mcpClientCloseTimeout, + "mcpClientCloseTimeout should be 10 seconds") + assert.Equal(t, 9*time.Second, processGracefulTimeout, + "processGracefulTimeout should be 9 seconds") + assert.Equal(t, 100*time.Millisecond, processTerminationPollInterval, + "processTerminationPollInterval should be 100ms") +} + +func TestKillProcessGroup_AlreadyDead(t *testing.T) { + // killProcessGroup should return quickly when process group doesn't exist + logger := zap.NewNop() + + // Use a PGID that definitely doesn't exist (negative or very large) + nonExistentPGID := 999999999 + + start := time.Now() + err := killProcessGroup(nonExistentPGID, logger, "test-server") + elapsed := time.Since(start) + + // Should return without error (process already dead is not an error) + assert.NoError(t, err) + + // Should return quickly (not wait for full timeout) + assert.Less(t, elapsed, 1*time.Second, + "killProcessGroup should return quickly for non-existent process 
group, took %v", elapsed) +} + +func TestKillProcessGroup_InvalidPGID(t *testing.T) { + logger := zap.NewNop() + + // Zero PGID should return immediately + start := time.Now() + err := killProcessGroup(0, logger, "test-server") + elapsed := time.Since(start) + + assert.NoError(t, err) + assert.Less(t, elapsed, 100*time.Millisecond, + "killProcessGroup with PGID=0 should return immediately") + + // Negative PGID should return immediately + start = time.Now() + err = killProcessGroup(-1, logger, "test-server") + elapsed = time.Since(start) + + assert.NoError(t, err) + assert.Less(t, elapsed, 100*time.Millisecond, + "killProcessGroup with negative PGID should return immediately") +} From ebe916be418cc86fcb4ac192652881f33383e760 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Sun, 21 Dec 2025 10:27:47 +0200 Subject: [PATCH 2/3] docs: Add subprocess shutdown behavior documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add new docs/operations/shutdown-behavior.md with detailed shutdown flow - Update docs/architecture.md with timeout table and subprocess flow - Update docs/configuration/upstream-servers.md with process lifecycle section - Update docs/features/docker-isolation.md with container lifecycle section Documents the graceful β†’ force shutdown approach: - 10s MCP client close timeout - 9s SIGTERM β†’ SIGKILL timeout - 30s Docker container cleanup timeout πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- docs/architecture.md | 14 +- docs/configuration/upstream-servers.md | 21 +++ docs/features/docker-isolation.md | 33 ++++ docs/operations/shutdown-behavior.md | 232 +++++++++++++++++++++++++ 4 files changed, 299 insertions(+), 1 deletion(-) create mode 100644 docs/operations/shutdown-behavior.md diff --git a/docs/architecture.md b/docs/architecture.md index 128e3ce8..9629bd0d 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -129,9 +129,21 @@ The event bus enables real-time communication between runtime and UI components: **Shutdown**: - Graceful context cancellation cascades to all background services -- Upstream servers disconnected with proper Docker container cleanup +- Upstream servers disconnected with proper subprocess and Docker container cleanup - Resources closed in dependency order (upstream β†’ cache β†’ index β†’ storage) +**Subprocess Shutdown Flow**: +1. **Graceful Close** (10s timeout): Close MCP client connection, wait for subprocess to exit cleanly +2. **Force Kill** (9s timeout): If graceful close fails, send SIGTERM to process group, poll for exit, then SIGKILL + +| Timeout | Value | Purpose | +|---------|-------|---------| +| MCP Client Close | 10s | Wait for graceful stdin/stdout close | +| SIGTERM β†’ SIGKILL | 9s | Time between graceful and force kill | +| Docker Cleanup | 30s | Container stop/kill timeout | + +See [Shutdown Behavior](/operations/shutdown-behavior) for detailed documentation. + ## Tray Application Architecture The tray application uses a robust state machine architecture for reliable core management. diff --git a/docs/configuration/upstream-servers.md b/docs/configuration/upstream-servers.md index 35b9ec02..c1fa87f3 100644 --- a/docs/configuration/upstream-servers.md +++ b/docs/configuration/upstream-servers.md @@ -95,6 +95,27 @@ For enhanced security, stdio servers can run in Docker containers: See [Docker Isolation](/features/docker-isolation) for complete documentation. 
+## Process Lifecycle + +### Startup + +When MCPProxy starts, it: +1. Loads server configurations from `mcp_config.json` +2. Creates MCP clients for each enabled, non-quarantined server +3. Connects to servers in the background (async) +4. Indexes tools once connections are established + +### Shutdown + +When MCPProxy stops, it performs graceful shutdown of all subprocesses: + +1. **Graceful Close** (10s): Close MCP connection, wait for process to exit +2. **Force Kill** (9s): If still running, SIGTERM β†’ poll β†’ SIGKILL + +**Process groups**: Child processes (spawned by npm/npx/uvx) are placed in a process group, ensuring all related processes are terminated together. + +See [Shutdown Behavior](/operations/shutdown-behavior) for detailed documentation. + ## Quarantine System New servers added via AI clients are automatically quarantined for security review. See [Security Quarantine](/features/security-quarantine) for details. diff --git a/docs/features/docker-isolation.md b/docs/features/docker-isolation.md index 17af98fc..9a5b820a 100644 --- a/docs/features/docker-isolation.md +++ b/docs/features/docker-isolation.md @@ -226,6 +226,39 @@ docker stats - Check container logs for specific error messages - Verify network access for package repositories +## Container Lifecycle + +### Startup + +When a Docker-isolated server starts: +1. MCPProxy detects runtime type (npm, uvx, python, etc.) +2. Selects appropriate Docker image +3. Runs container with stdio transport (`docker run -i`) +4. Establishes MCP connection via stdin/stdout + +### Shutdown + +When MCPProxy stops, containers are cleaned up with a 30-second timeout: + +1. **Graceful Stop**: `docker stop` (sends SIGTERM to container) +2. **Force Kill**: `docker kill` if container doesn't stop gracefully + +Containers are labeled with `mcpproxy.managed=true` for identification. + +### Manual Cleanup + +If containers remain after MCPProxy stops: + +```bash +# List MCPProxy-managed containers +docker ps --filter "label=mcpproxy.managed=true" + +# Remove all MCPProxy containers +docker rm -f $(docker ps -q --filter "label=mcpproxy.managed=true") +``` + +See [Shutdown Behavior](/operations/shutdown-behavior) for detailed subprocess lifecycle documentation. + ## Security Considerations Docker isolation provides strong security boundaries but consider: diff --git a/docs/operations/shutdown-behavior.md b/docs/operations/shutdown-behavior.md new file mode 100644 index 00000000..420bc95d --- /dev/null +++ b/docs/operations/shutdown-behavior.md @@ -0,0 +1,232 @@ +--- +id: shutdown-behavior +title: Shutdown Behavior +sidebar_label: Shutdown Behavior +sidebar_position: 1 +description: How MCPProxy handles graceful shutdown of upstream servers and subprocesses +keywords: [shutdown, graceful, sigterm, sigkill, process, cleanup, timeout] +--- + +# Shutdown Behavior + +This document describes how MCPProxy handles graceful shutdown of upstream servers, including subprocess termination timeouts and cleanup procedures. + +## Overview + +When MCPProxy shuts down (via Ctrl+C, SIGTERM, or tray quit), it follows a structured cleanup process: + +1. Cancel application context (signals all background services to stop) +2. Stop OAuth refresh manager +3. Stop Supervisor (reconciliation loop) +4. Shutdown all upstream servers (graceful β†’ force) +5. 
Close caches, indexes, and storage + +## Subprocess Shutdown Flow + +For stdio-based MCP servers (processes started via `command`), MCPProxy uses a two-phase shutdown approach: + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Graceful Close Phase β”‚ +β”‚ (10 seconds max) β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ 1. Close MCP client connection (stdin/stdout) β”‚ +β”‚ 2. Subprocess receives EOF and should exit cleanly β”‚ +β”‚ 3. Wait up to 10 seconds for graceful exit β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό (if timeout) +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Force Kill Phase β”‚ +β”‚ (9 seconds max) β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ 1. Send SIGTERM to entire process group β”‚ +β”‚ 2. Poll every 100ms to check if process exited β”‚ +β”‚ 3. After 9 seconds: send SIGKILL (force kill) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +## Timeout Constants + +| Constant | Value | Description | +|----------|-------|-------------| +| `mcpClientCloseTimeout` | 10s | Max time to wait for graceful MCP client close | +| `processGracefulTimeout` | 9s | Max time after SIGTERM before SIGKILL | +| `processTerminationPollInterval` | 100ms | How often to check if process exited | +| `dockerCleanupTimeout` | 30s | Max time for Docker container cleanup | + +### Why 9 seconds for SIGTERM? + +The SIGTERM timeout (9s) is intentionally less than the MCP client close timeout (10s). This ensures that if graceful close times out, the force kill phase can complete within a reasonable time window. + +**Total worst case for stdio servers:** 10s (graceful) + 9s (force kill) = 19 seconds + +## Docker Container Shutdown + +Docker containers follow a similar pattern but use Docker's native stop mechanism: + +1. `docker stop` (sends SIGTERM, waits for graceful exit) +2. If container doesn't stop: `docker kill` (sends SIGKILL) + +Docker cleanup has a 30-second timeout to allow for container-specific cleanup procedures. 
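+
+As a rough illustration, this cleanup reduces to the sketch below. The function name and shape are illustrative only; the real logic lives in `killDockerContainerWithContext` and adds logging and fallback paths:
+
+```go
+package main
+
+import (
+	"context"
+	"os/exec"
+	"time"
+)
+
+// stopContainer sketches the two-step shutdown: a graceful `docker stop`
+// first, `docker kill` as a fallback, bounded by dockerCleanupTimeout (30s).
+func stopContainer(containerID string) {
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	// `docker stop` sends SIGTERM and waits for the container to exit.
+	if err := exec.CommandContext(ctx, "docker", "stop", containerID).Run(); err != nil {
+		// The container ignored SIGTERM or the stop timed out; force it.
+		_ = exec.CommandContext(ctx, "docker", "kill", containerID).Run()
+	}
+}
+```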
+ +## Process Groups + +MCPProxy uses Unix process groups to ensure all child processes are properly cleaned up: + +```go +// All child processes are placed in a new process group +cmd.SysProcAttr = &syscall.SysProcAttr{ + Setpgid: true, // Create new process group + Pgid: 0, // Make this process the group leader +} +``` + +When shutting down, MCPProxy sends signals to the entire process group (`-pgid`), ensuring that: +- Child processes spawned by npm/npx are terminated +- Orphaned processes don't accumulate +- All related processes receive the shutdown signal + +## What Happens During Shutdown + +### When `call_tool` is called during shutdown + +If an AI client tries to call a tool while MCPProxy is shutting down: + +``` +Error: "Server 'xxx' is not connected (state: Disconnected)" +``` + +Or if the server client was already removed: + +``` +Error: "No client found for server: xxx" +``` + +### When `retrieve_tools` is called during shutdown + +- If the search index is still open: Returns results (possibly stale) +- After index is closed: Returns an error + +### When `tools/list_changed` notification arrives during shutdown + +The notification is safely ignored: +- Callback context is cancelled +- Discovery doesn't block shutdown +- Logged as a warning, no user impact + +## Shutdown Order + +``` +Runtime.Close() + β”‚ + β”œβ”€β–Ί Cancel app context + β”‚ + β”œβ”€β–Ί Stop OAuth refresh manager + β”‚ └─► Prevents token refresh during shutdown + β”‚ + β”œβ”€β–Ί Stop Supervisor + β”‚ β”œβ”€β–Ί Cancel reconciliation context + β”‚ β”œβ”€β–Ί Wait for goroutines to exit + β”‚ └─► Close upstream adapter + β”‚ + β”œβ”€β–Ί ShutdownAll on upstream manager (45s total timeout) + β”‚ └─► For each server (parallel): + β”‚ β”œβ”€β–Ί Graceful close (10s) + β”‚ └─► Force kill if needed (9s) + β”‚ + β”œβ”€β–Ί Close cache manager + β”‚ + β”œβ”€β–Ί Close index manager + β”‚ + β”œβ”€β–Ί Close storage manager + β”‚ + └─► Close config service +``` + +## Debugging Shutdown Issues + +### Check for orphaned processes + +```bash +# After stopping MCPProxy, check for orphaned MCP server processes +pgrep -f "npx.*mcp" +pgrep -f "uvx.*mcp" +pgrep -f "node.*server" + +# If found, kill them manually +pkill -f "npx.*mcp" +``` + +### Enable debug logging for shutdown + +```bash +mcpproxy serve --log-level=debug 2>&1 | grep -E "(Disconnect|shutdown|SIGTERM|SIGKILL|process group)" +``` + +### View shutdown logs + +Look for these log messages during shutdown: + +``` +INFO Disconnecting from upstream MCP server +DEBUG Attempting graceful MCP client close +DEBUG MCP client closed gracefully # Success! +# OR +WARN MCP client close timed out # Graceful failed +INFO Graceful close failed, force killing process group +DEBUG SIGTERM sent to process group +INFO Process group terminated gracefully # SIGTERM worked +# OR +WARN Process group still running after SIGTERM, sending SIGKILL +INFO SIGKILL sent to process group +``` + +## Troubleshooting + +### Server processes not terminating + +**Symptoms:** `npx` or `uvx` processes remain running after MCPProxy stops. + +**Possible causes:** +1. Process ignoring SIGTERM (bad signal handling in MCP server) +2. Process group not properly set up +3. Zombie processes from previous crashes + +**Solutions:** +- Check server logs: `mcpproxy upstream logs ` +- Manually kill orphaned processes +- Report issue if consistently reproducible + +### Shutdown taking too long + +**Symptoms:** MCPProxy takes 20+ seconds to shut down. + +**Possible causes:** +1. Many servers running in parallel +2. 
Servers not responding to graceful shutdown +3. Docker containers with slow cleanup + +**Solutions:** +- Check which servers are slow: enable debug logging +- Consider disabling problematic servers before shutdown +- Report consistently slow servers as bugs + +### Docker containers not cleaning up + +**Symptoms:** Docker containers remain running after MCPProxy stops. + +**Solutions:** +```bash +# List MCPProxy containers +docker ps --filter "label=mcpproxy.managed=true" + +# Force remove all MCPProxy containers +docker rm -f $(docker ps -q --filter "label=mcpproxy.managed=true") +``` + +## Related Documentation + +- [Architecture](/architecture) - Runtime and lifecycle overview +- [Docker Isolation](/features/docker-isolation) - Container management +- [Upstream Servers](/configuration/upstream-servers) - Server configuration From 5d7324add3fe2430e165ac3f8076d20d8147de62 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Sun, 21 Dec 2025 10:30:24 +0200 Subject: [PATCH 3/3] docs: Add Operations section to Docusaurus site MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add operations/ to docusaurus.config.js include list - Add Operations category to sidebars.js - Update development/architecture.md with shutdown info - Fix broken link in shutdown-behavior.md (/architecture β†’ /development/architecture) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- docs/development/architecture.md | 15 +++++++++++++++ docs/operations/shutdown-behavior.md | 2 +- website/docusaurus.config.js | 2 ++ website/sidebars.js | 7 +++++++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/docs/development/architecture.md b/docs/development/architecture.md index 4571d62d..7da89e5b 100644 --- a/docs/development/architecture.md +++ b/docs/development/architecture.md @@ -86,4 +86,19 @@ Disconnected β†’ Connecting β†’ Authenticating β†’ Ready (on error) ``` +## Subprocess Shutdown + +When MCPProxy stops, subprocesses are terminated using a two-phase approach: + +1. **Graceful Close** (10s): Close MCP connection, wait for process to exit +2. **Force Kill** (9s): If still running, SIGTERM β†’ poll β†’ SIGKILL + +| Timeout | Value | Purpose | +|---------|-------|---------| +| MCP Client Close | 10s | Wait for graceful stdin/stdout close | +| SIGTERM β†’ SIGKILL | 9s | Time between graceful and force kill | +| Docker Cleanup | 30s | Container stop/kill timeout | + +See [Shutdown Behavior](/operations/shutdown-behavior) for detailed documentation. + For complete architecture details, see [docs/architecture.md](https://github.com/smart-mcp-proxy/mcpproxy-go/blob/main/docs/architecture.md) in the repository. 
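Condensed, the graceful-close guard that patch 1 adds to `DisconnectWithContext` reduces to the pattern below (a sketch with an assumed `closer` interface; the real code also logs, tracks server state, and reads fields under the mutex):

```go
package core

import "time"

// closer is a minimal stand-in for the MCP client in this sketch.
type closer interface{ Close() error }

// closeWithTimeout runs Close() in a goroutine so a hung transport cannot
// stall shutdown past the timeout. When it returns false, the caller falls
// through to the SIGTERM β†’ poll β†’ SIGKILL path in killProcessGroup.
func closeWithTimeout(c closer, timeout time.Duration) bool {
	done := make(chan struct{})
	go func() {
		_ = c.Close()
		close(done)
	}()
	select {
	case <-done:
		return true // subprocess saw EOF on stdin and exited cleanly
	case <-time.After(timeout):
		return false // mcpClientCloseTimeout elapsed; force kill is next
	}
}
```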
diff --git a/docs/operations/shutdown-behavior.md b/docs/operations/shutdown-behavior.md index 420bc95d..7265c75f 100644 --- a/docs/operations/shutdown-behavior.md +++ b/docs/operations/shutdown-behavior.md @@ -227,6 +227,6 @@ docker rm -f $(docker ps -q --filter "label=mcpproxy.managed=true") ## Related Documentation -- [Architecture](/architecture) - Runtime and lifecycle overview +- [Architecture](/development/architecture) - Runtime and lifecycle overview - [Docker Isolation](/features/docker-isolation) - Container management - [Upstream Servers](/configuration/upstream-servers) - Server configuration diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index e0dcd5d8..646718e0 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -48,6 +48,7 @@ const config = { 'api/**/*.{md,mdx}', 'web-ui/**/*.{md,mdx}', 'features/**/*.{md,mdx}', + 'operations/**/*.{md,mdx}', 'development/**/*.{md,mdx}', 'contributing.md', ], @@ -75,6 +76,7 @@ const config = { 'api/*', 'web-ui/*', 'features/*', + 'operations/*', ], }, ], diff --git a/website/sidebars.js b/website/sidebars.js index 06cc9ae0..033cd118 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -68,6 +68,13 @@ const sidebars = { 'features/version-updates', ], }, + { + type: 'category', + label: 'Operations', + items: [ + 'operations/shutdown-behavior', + ], + }, { type: 'category', label: 'Development',
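One path the new `shutdown_test.go` does not exercise is the SIGTERM β†’ SIGKILL escalation itself. A hypothetical follow-up test (not part of this series) could spawn a process group whose leader traps SIGTERM, then assert that `killProcessGroup` escalates. Note it deliberately takes about 9 seconds, since it waits out the full grace period:

```go
//go:build unix

package core

import (
	"os/exec"
	"syscall"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/zap"
)

// TestKillProcessGroup_EscalatesToSIGKILL is a sketch: the shell ignores
// SIGTERM and keeps respawning sleep, so only SIGKILL can stop the group.
func TestKillProcessGroup_EscalatesToSIGKILL(t *testing.T) {
	cmd := exec.Command("sh", "-c", `trap "" TERM; while true; do sleep 1; done`)
	// Setpgid makes the child the leader of a new process group, so its
	// PID doubles as the PGID passed to killProcessGroup.
	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	require.NoError(t, cmd.Start())

	start := time.Now()
	err := killProcessGroup(cmd.Process.Pid, zap.NewNop(), "test-server")
	elapsed := time.Since(start)

	assert.NoError(t, err)
	assert.GreaterOrEqual(t, elapsed, processGracefulTimeout,
		"escalation should wait out the SIGTERM grace period")

	_ = cmd.Wait() // reap the SIGKILLed group leader
}
```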