From 1de875dae1ba0cf9bd0d9386768b9eeab53eff4c Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Sun, 21 Dec 2025 08:41:01 +0200 Subject: [PATCH 1/3] fix: Improve subprocess shutdown with proper timeout handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #51 Changes: - Add timeout constants: mcpClientCloseTimeout (10s), processGracefulTimeout (9s) - Fix order: try graceful MCP client close FIRST, then force kill if needed - Fix mutex: release before long I/O operations, reacquire for state updates - Fix killProcessGroup: poll every 100ms instead of fixed 10s sleep - Add early exit when process group is already dead The 9s SIGTERMβ†’SIGKILL timeout fits within the 10s MCP close timeout, ensuring force kill completes before we give up on graceful close. πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- internal/upstream/core/connection.go | 156 +++++++++++------------- internal/upstream/core/process_unix.go | 72 +++++++---- internal/upstream/core/shutdown_test.go | 68 +++++++++++ 3 files changed, 187 insertions(+), 109 deletions(-) create mode 100644 internal/upstream/core/shutdown_test.go diff --git a/internal/upstream/core/connection.go b/internal/upstream/core/connection.go index 5302aa59..5337e621 100644 --- a/internal/upstream/core/connection.go +++ b/internal/upstream/core/connection.go @@ -28,6 +28,15 @@ const ( dockerCleanupTimeout = 30 * time.Second + // Subprocess shutdown timeouts + // mcpClientCloseTimeout is the max time to wait for graceful MCP client close + mcpClientCloseTimeout = 10 * time.Second + // processGracefulTimeout is the max time to wait after SIGTERM before SIGKILL + // Must be less than mcpClientCloseTimeout to complete within the close timeout + processGracefulTimeout = 9 * time.Second + // processTerminationPollInterval is how often to check if process exited + processTerminationPollInterval = 100 * time.Millisecond + // Transport types transportHTTP = "http" transportHTTPStreamable = "streamable-http" @@ -1962,137 +1971,116 @@ func (c *Client) Disconnect() error { // DisconnectWithContext closes the connection with context timeout func (c *Client) DisconnectWithContext(_ context.Context) error { + // Step 1: Read state under lock, then release for I/O operations c.mu.Lock() - defer c.mu.Unlock() - - // CRITICAL FIX: Always perform cleanup, even if not fully connected - // Servers in "Connecting" state may have Docker containers running wasConnected := c.connected + mcpClient := c.client + isDocker := c.isDockerCommand + containerID := c.containerID + containerName := c.containerName + pgid := c.processGroupID + processCmd := c.processCmd + serverName := c.config.Name + c.mu.Unlock() c.logger.Info("Disconnecting from upstream MCP server", zap.Bool("was_connected", wasConnected)) - // Log disconnection to server-specific log if c.upstreamLogger != nil { c.upstreamLogger.Info("Disconnecting from server", zap.Bool("was_connected", wasConnected)) } - // Stop stderr monitoring before closing client + // Step 2: Stop monitoring (these have their own locks) c.StopStderrMonitoring() - - // Stop process monitoring before closing client c.StopProcessMonitoring() - // For Docker containers, kill the container before closing the client - // IMPORTANT: Do this even if not fully connected, as containers may be running - if c.isDockerCommand { + // Step 3: For Docker containers, use Docker-specific cleanup + if isDocker { c.logger.Debug("Disconnecting Docker command, 
attempting container cleanup", - zap.String("server", c.config.Name), - zap.Bool("has_container_id", c.containerID != "")) + zap.String("server", serverName), + zap.Bool("has_container_id", containerID != "")) - // Create a fresh context for Docker cleanup with its own timeout - // This ensures cleanup can complete even if the main context expires cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), dockerCleanupTimeout) defer cleanupCancel() - if c.containerID != "" { + if containerID != "" { c.logger.Debug("Cleaning up Docker container by ID", - zap.String("server", c.config.Name), - zap.String("container_id", c.containerID)) + zap.String("server", serverName), + zap.String("container_id", containerID)) c.killDockerContainerWithContext(cleanupCtx) - c.logger.Debug("Docker container cleanup by ID completed", - zap.String("server", c.config.Name)) - } else if c.containerName != "" { + } else if containerName != "" { c.logger.Debug("Cleaning up Docker container by name", - zap.String("server", c.config.Name), - zap.String("container_name", c.containerName)) - c.killDockerContainerByNameWithContext(cleanupCtx, c.containerName) - c.logger.Debug("Docker container cleanup by name completed", - zap.String("server", c.config.Name)) + zap.String("server", serverName), + zap.String("container_name", containerName)) + c.killDockerContainerByNameWithContext(cleanupCtx, containerName) } else { - c.logger.Debug("No container ID or name available, using pattern-based cleanup method", - zap.String("server", c.config.Name)) - // Fallback: try to find and kill any containers started by this command + c.logger.Debug("No container ID or name, using pattern-based cleanup", + zap.String("server", serverName)) c.killDockerContainerByCommandWithContext(cleanupCtx) - c.logger.Debug("Docker fallback cleanup completed", - zap.String("server", c.config.Name)) } - } else { - c.logger.Debug("Non-Docker command disconnecting, no container cleanup needed", - zap.String("server", c.config.Name)) } - // CRITICAL FIX: Clean up process group to prevent zombie processes - if c.processGroupID > 0 { - c.logger.Info("Cleaning up process group to prevent zombie processes", - zap.String("server", c.config.Name), - zap.Int("pgid", c.processGroupID)) - - if err := killProcessGroup(c.processGroupID, c.logger, c.config.Name); err != nil { - c.logger.Error("Failed to clean up process group", - zap.String("server", c.config.Name), - zap.Int("pgid", c.processGroupID), - zap.Error(err)) - } else { - c.logger.Info("Process group cleanup completed successfully", - zap.String("server", c.config.Name), - zap.Int("pgid", c.processGroupID)) - } - - // Reset process group ID after cleanup - c.processGroupID = 0 - } - - // Close MCP client if it exists - if c.client != nil { - c.logger.Debug("Closing MCP client connection", - zap.String("server", c.config.Name)) + // Step 4: Try graceful close via MCP client FIRST + // This gives the subprocess a chance to exit cleanly via stdin/stdout close + gracefulCloseSucceeded := false + if mcpClient != nil { + c.logger.Debug("Attempting graceful MCP client close", + zap.String("server", serverName)) closeDone := make(chan struct{}) go func() { - c.client.Close() + mcpClient.Close() close(closeDone) }() select { case <-closeDone: - c.logger.Debug("MCP client connection closed", - zap.String("server", c.config.Name)) - case <-time.After(2 * time.Second): - c.logger.Warn("MCP client close timed out, forcing shutdown", - zap.String("server", c.config.Name)) + c.logger.Debug("MCP client 
closed gracefully", + zap.String("server", serverName)) + gracefulCloseSucceeded = true + case <-time.After(mcpClientCloseTimeout): + c.logger.Warn("MCP client close timed out", + zap.String("server", serverName), + zap.Duration("timeout", mcpClientCloseTimeout)) + } + } - // Attempt process group cleanup if available - if c.processGroupID > 0 { - _ = killProcessGroup(c.processGroupID, c.logger, c.config.Name) - } + // Step 5: Force kill process group only if graceful close failed + // For non-Docker stdio processes that didn't exit gracefully + if !gracefulCloseSucceeded && !isDocker && pgid > 0 { + c.logger.Info("Graceful close failed, force killing process group", + zap.String("server", serverName), + zap.Int("pgid", pgid)) - if c.processCmd != nil && c.processCmd.Process != nil { - if err := c.processCmd.Process.Kill(); err != nil { - c.logger.Warn("Failed to kill stdio process after close timeout", - zap.String("server", c.config.Name), - zap.Error(err)) - } else { - c.logger.Info("Killed stdio process after close timeout", - zap.String("server", c.config.Name)) - } + if err := killProcessGroup(pgid, c.logger, serverName); err != nil { + c.logger.Error("Failed to kill process group", + zap.String("server", serverName), + zap.Int("pgid", pgid), + zap.Error(err)) + } + + // Also try direct process kill as last resort + if processCmd != nil && processCmd.Process != nil { + if err := processCmd.Process.Kill(); err != nil { + c.logger.Debug("Direct process kill failed (may already be dead)", + zap.String("server", serverName), + zap.Error(err)) } } - } else { - c.logger.Debug("No MCP client to close (may be in connecting state)", - zap.String("server", c.config.Name)) } + // Step 6: Update state under lock + c.mu.Lock() c.client = nil c.serverInfo = nil c.connected = false - - // Clear cached tools on disconnect c.cachedTools = nil + c.processGroupID = 0 + c.mu.Unlock() c.logger.Debug("Disconnect completed successfully", - zap.String("server", c.config.Name)) + zap.String("server", serverName)) return nil } diff --git a/internal/upstream/core/process_unix.go b/internal/upstream/core/process_unix.go index 5efa3de4..c115859b 100644 --- a/internal/upstream/core/process_unix.go +++ b/internal/upstream/core/process_unix.go @@ -67,46 +67,68 @@ func killProcessGroup(pgid int, logger *zap.Logger, serverName string) error { zap.String("server", serverName), zap.Int("pgid", pgid)) - // Step 1: Send SIGTERM to the entire process group + // Step 1: Check if process group is already dead + if err := syscall.Kill(-pgid, 0); err != nil { + logger.Debug("Process group already terminated", + zap.String("server", serverName), + zap.Int("pgid", pgid)) + return nil + } + + // Step 2: Send SIGTERM to the entire process group err := syscall.Kill(-pgid, syscall.SIGTERM) if err != nil { logger.Warn("Failed to send SIGTERM to process group", zap.String("server", serverName), zap.Int("pgid", pgid), zap.Error(err)) - } else { - logger.Debug("SIGTERM sent to process group", - zap.String("server", serverName), - zap.Int("pgid", pgid)) + // Process might have exited between check and kill, that's ok + return nil } - // Step 2: Wait a bit for graceful termination - time.Sleep(2 * time.Second) - - // Step 3: Check if processes are still running and send SIGKILL if needed - if err := syscall.Kill(-pgid, 0); err == nil { - // Processes still exist, force kill them - logger.Warn("Process group still running after SIGTERM, sending SIGKILL", - zap.String("server", serverName), - zap.Int("pgid", pgid)) + logger.Debug("SIGTERM 
sent to process group", + zap.String("server", serverName), + zap.Int("pgid", pgid)) - if killErr := syscall.Kill(-pgid, syscall.SIGKILL); killErr != nil { - logger.Error("Failed to send SIGKILL to process group", + // Step 3: Poll for graceful termination (up to processGracefulTimeout) + // This allows fast exit if process terminates quickly + deadline := time.Now().Add(processGracefulTimeout) + for time.Now().Before(deadline) { + if err := syscall.Kill(-pgid, 0); err != nil { + // Process group is dead + logger.Info("Process group terminated gracefully after SIGTERM", zap.String("server", serverName), - zap.Int("pgid", pgid), - zap.Error(killErr)) - return killErr + zap.Int("pgid", pgid)) + return nil } + time.Sleep(processTerminationPollInterval) + } - logger.Info("SIGKILL sent to process group", - zap.String("server", serverName), - zap.Int("pgid", pgid)) - } else { - logger.Info("Process group terminated successfully", + // Step 4: Process still running, send SIGKILL + logger.Warn("Process group still running after SIGTERM timeout, sending SIGKILL", + zap.String("server", serverName), + zap.Int("pgid", pgid), + zap.Duration("timeout", processGracefulTimeout)) + + if killErr := syscall.Kill(-pgid, syscall.SIGKILL); killErr != nil { + // Check if it died between our check and kill + if syscall.Kill(-pgid, 0) != nil { + logger.Debug("Process group exited just before SIGKILL", + zap.String("server", serverName), + zap.Int("pgid", pgid)) + return nil + } + logger.Error("Failed to send SIGKILL to process group", zap.String("server", serverName), - zap.Int("pgid", pgid)) + zap.Int("pgid", pgid), + zap.Error(killErr)) + return killErr } + logger.Info("SIGKILL sent to process group", + zap.String("server", serverName), + zap.Int("pgid", pgid)) + return nil } diff --git a/internal/upstream/core/shutdown_test.go b/internal/upstream/core/shutdown_test.go new file mode 100644 index 00000000..f08e6cf6 --- /dev/null +++ b/internal/upstream/core/shutdown_test.go @@ -0,0 +1,68 @@ +//go:build unix + +package core + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "go.uber.org/zap" +) + +func TestShutdownTimeoutConstants(t *testing.T) { + // Verify timeout relationship: processGracefulTimeout < mcpClientCloseTimeout + // This ensures killProcessGroup can complete within the MCP close timeout + assert.Less(t, processGracefulTimeout, mcpClientCloseTimeout, + "processGracefulTimeout (%v) must be less than mcpClientCloseTimeout (%v)", + processGracefulTimeout, mcpClientCloseTimeout) + + // Verify reasonable values + assert.Equal(t, 10*time.Second, mcpClientCloseTimeout, + "mcpClientCloseTimeout should be 10 seconds") + assert.Equal(t, 9*time.Second, processGracefulTimeout, + "processGracefulTimeout should be 9 seconds") + assert.Equal(t, 100*time.Millisecond, processTerminationPollInterval, + "processTerminationPollInterval should be 100ms") +} + +func TestKillProcessGroup_AlreadyDead(t *testing.T) { + // killProcessGroup should return quickly when process group doesn't exist + logger := zap.NewNop() + + // Use a PGID that definitely doesn't exist (negative or very large) + nonExistentPGID := 999999999 + + start := time.Now() + err := killProcessGroup(nonExistentPGID, logger, "test-server") + elapsed := time.Since(start) + + // Should return without error (process already dead is not an error) + assert.NoError(t, err) + + // Should return quickly (not wait for full timeout) + assert.Less(t, elapsed, 1*time.Second, + "killProcessGroup should return quickly for non-existent process 
group, took %v", elapsed) +} + +func TestKillProcessGroup_InvalidPGID(t *testing.T) { + logger := zap.NewNop() + + // Zero PGID should return immediately + start := time.Now() + err := killProcessGroup(0, logger, "test-server") + elapsed := time.Since(start) + + assert.NoError(t, err) + assert.Less(t, elapsed, 100*time.Millisecond, + "killProcessGroup with PGID=0 should return immediately") + + // Negative PGID should return immediately + start = time.Now() + err = killProcessGroup(-1, logger, "test-server") + elapsed = time.Since(start) + + assert.NoError(t, err) + assert.Less(t, elapsed, 100*time.Millisecond, + "killProcessGroup with negative PGID should return immediately") +} From ebe916be418cc86fcb4ac192652881f33383e760 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Sun, 21 Dec 2025 10:27:47 +0200 Subject: [PATCH 2/3] docs: Add subprocess shutdown behavior documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add new docs/operations/shutdown-behavior.md with detailed shutdown flow - Update docs/architecture.md with timeout table and subprocess flow - Update docs/configuration/upstream-servers.md with process lifecycle section - Update docs/features/docker-isolation.md with container lifecycle section Documents the graceful β†’ force shutdown approach: - 10s MCP client close timeout - 9s SIGTERM β†’ SIGKILL timeout - 30s Docker container cleanup timeout πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- docs/architecture.md | 14 +- docs/configuration/upstream-servers.md | 21 +++ docs/features/docker-isolation.md | 33 ++++ docs/operations/shutdown-behavior.md | 232 +++++++++++++++++++++++++ 4 files changed, 299 insertions(+), 1 deletion(-) create mode 100644 docs/operations/shutdown-behavior.md diff --git a/docs/architecture.md b/docs/architecture.md index 128e3ce8..9629bd0d 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -129,9 +129,21 @@ The event bus enables real-time communication between runtime and UI components: **Shutdown**: - Graceful context cancellation cascades to all background services -- Upstream servers disconnected with proper Docker container cleanup +- Upstream servers disconnected with proper subprocess and Docker container cleanup - Resources closed in dependency order (upstream β†’ cache β†’ index β†’ storage) +**Subprocess Shutdown Flow**: +1. **Graceful Close** (10s timeout): Close MCP client connection, wait for subprocess to exit cleanly +2. **Force Kill** (9s timeout): If graceful close fails, send SIGTERM to process group, poll for exit, then SIGKILL + +| Timeout | Value | Purpose | +|---------|-------|---------| +| MCP Client Close | 10s | Wait for graceful stdin/stdout close | +| SIGTERM β†’ SIGKILL | 9s | Time between graceful and force kill | +| Docker Cleanup | 30s | Container stop/kill timeout | + +See [Shutdown Behavior](/operations/shutdown-behavior) for detailed documentation. + ## Tray Application Architecture The tray application uses a robust state machine architecture for reliable core management. diff --git a/docs/configuration/upstream-servers.md b/docs/configuration/upstream-servers.md index 35b9ec02..c1fa87f3 100644 --- a/docs/configuration/upstream-servers.md +++ b/docs/configuration/upstream-servers.md @@ -95,6 +95,27 @@ For enhanced security, stdio servers can run in Docker containers: See [Docker Isolation](/features/docker-isolation) for complete documentation. 
+## Process Lifecycle + +### Startup + +When MCPProxy starts, it: +1. Loads server configurations from `mcp_config.json` +2. Creates MCP clients for each enabled, non-quarantined server +3. Connects to servers in the background (async) +4. Indexes tools once connections are established + +### Shutdown + +When MCPProxy stops, it performs graceful shutdown of all subprocesses: + +1. **Graceful Close** (10s): Close MCP connection, wait for process to exit +2. **Force Kill** (9s): If still running, SIGTERM β†’ poll β†’ SIGKILL + +**Process groups**: Child processes (spawned by npm/npx/uvx) are placed in a process group, ensuring all related processes are terminated together. + +See [Shutdown Behavior](/operations/shutdown-behavior) for detailed documentation. + ## Quarantine System New servers added via AI clients are automatically quarantined for security review. See [Security Quarantine](/features/security-quarantine) for details. diff --git a/docs/features/docker-isolation.md b/docs/features/docker-isolation.md index 17af98fc..9a5b820a 100644 --- a/docs/features/docker-isolation.md +++ b/docs/features/docker-isolation.md @@ -226,6 +226,39 @@ docker stats - Check container logs for specific error messages - Verify network access for package repositories +## Container Lifecycle + +### Startup + +When a Docker-isolated server starts: +1. MCPProxy detects runtime type (npm, uvx, python, etc.) +2. Selects appropriate Docker image +3. Runs container with stdio transport (`docker run -i`) +4. Establishes MCP connection via stdin/stdout + +### Shutdown + +When MCPProxy stops, containers are cleaned up with a 30-second timeout: + +1. **Graceful Stop**: `docker stop` (sends SIGTERM to container) +2. **Force Kill**: `docker kill` if container doesn't stop gracefully + +Containers are labeled with `mcpproxy.managed=true` for identification. + +### Manual Cleanup + +If containers remain after MCPProxy stops: + +```bash +# List MCPProxy-managed containers +docker ps --filter "label=mcpproxy.managed=true" + +# Remove all MCPProxy containers +docker rm -f $(docker ps -q --filter "label=mcpproxy.managed=true") +``` + +See [Shutdown Behavior](/operations/shutdown-behavior) for detailed subprocess lifecycle documentation. + ## Security Considerations Docker isolation provides strong security boundaries but consider: diff --git a/docs/operations/shutdown-behavior.md b/docs/operations/shutdown-behavior.md new file mode 100644 index 00000000..420bc95d --- /dev/null +++ b/docs/operations/shutdown-behavior.md @@ -0,0 +1,232 @@ +--- +id: shutdown-behavior +title: Shutdown Behavior +sidebar_label: Shutdown Behavior +sidebar_position: 1 +description: How MCPProxy handles graceful shutdown of upstream servers and subprocesses +keywords: [shutdown, graceful, sigterm, sigkill, process, cleanup, timeout] +--- + +# Shutdown Behavior + +This document describes how MCPProxy handles graceful shutdown of upstream servers, including subprocess termination timeouts and cleanup procedures. + +## Overview + +When MCPProxy shuts down (via Ctrl+C, SIGTERM, or tray quit), it follows a structured cleanup process: + +1. Cancel application context (signals all background services to stop) +2. Stop OAuth refresh manager +3. Stop Supervisor (reconciliation loop) +4. Shutdown all upstream servers (graceful β†’ force) +5. 
Close caches, indexes, and storage + +## Subprocess Shutdown Flow + +For stdio-based MCP servers (processes started via `command`), MCPProxy uses a two-phase shutdown approach: + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Graceful Close Phase β”‚ +β”‚ (10 seconds max) β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ 1. Close MCP client connection (stdin/stdout) β”‚ +β”‚ 2. Subprocess receives EOF and should exit cleanly β”‚ +β”‚ 3. Wait up to 10 seconds for graceful exit β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό (if timeout) +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Force Kill Phase β”‚ +β”‚ (9 seconds max) β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ 1. Send SIGTERM to entire process group β”‚ +β”‚ 2. Poll every 100ms to check if process exited β”‚ +β”‚ 3. After 9 seconds: send SIGKILL (force kill) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +## Timeout Constants + +| Constant | Value | Description | +|----------|-------|-------------| +| `mcpClientCloseTimeout` | 10s | Max time to wait for graceful MCP client close | +| `processGracefulTimeout` | 9s | Max time after SIGTERM before SIGKILL | +| `processTerminationPollInterval` | 100ms | How often to check if process exited | +| `dockerCleanupTimeout` | 30s | Max time for Docker container cleanup | + +### Why 9 seconds for SIGTERM? + +The SIGTERM timeout (9s) is intentionally less than the MCP client close timeout (10s). This ensures that if graceful close times out, the force kill phase can complete within a reasonable time window. + +**Total worst case for stdio servers:** 10s (graceful) + 9s (force kill) = 19 seconds + +## Docker Container Shutdown + +Docker containers follow a similar pattern but use Docker's native stop mechanism: + +1. `docker stop` (sends SIGTERM, waits for graceful exit) +2. If container doesn't stop: `docker kill` (sends SIGKILL) + +Docker cleanup has a 30-second timeout to allow for container-specific cleanup procedures. 
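+
+As a rough illustration, this cleanup reduces to the sketch below. The function name and shape are illustrative only; the real logic lives in `killDockerContainerWithContext` and adds logging and fallback paths:
+
+```go
+package main
+
+import (
+	"context"
+	"os/exec"
+	"time"
+)
+
+// stopContainer sketches the two-step shutdown: a graceful `docker stop`
+// first, `docker kill` as a fallback, bounded by dockerCleanupTimeout (30s).
+func stopContainer(containerID string) {
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	// `docker stop` sends SIGTERM and waits for the container to exit.
+	if err := exec.CommandContext(ctx, "docker", "stop", containerID).Run(); err != nil {
+		// The container ignored SIGTERM or the stop timed out; force it.
+		_ = exec.CommandContext(ctx, "docker", "kill", containerID).Run()
+	}
+}
+```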
+ +## Process Groups + +MCPProxy uses Unix process groups to ensure all child processes are properly cleaned up: + +```go +// All child processes are placed in a new process group +cmd.SysProcAttr = &syscall.SysProcAttr{ + Setpgid: true, // Create new process group + Pgid: 0, // Make this process the group leader +} +``` + +When shutting down, MCPProxy sends signals to the entire process group (`-pgid`), ensuring that: +- Child processes spawned by npm/npx are terminated +- Orphaned processes don't accumulate +- All related processes receive the shutdown signal + +## What Happens During Shutdown + +### When `call_tool` is called during shutdown + +If an AI client tries to call a tool while MCPProxy is shutting down: + +``` +Error: "Server 'xxx' is not connected (state: Disconnected)" +``` + +Or if the server client was already removed: + +``` +Error: "No client found for server: xxx" +``` + +### When `retrieve_tools` is called during shutdown + +- If the search index is still open: Returns results (possibly stale) +- After index is closed: Returns an error + +### When `tools/list_changed` notification arrives during shutdown + +The notification is safely ignored: +- Callback context is cancelled +- Discovery doesn't block shutdown +- Logged as a warning, no user impact + +## Shutdown Order + +``` +Runtime.Close() + β”‚ + β”œβ”€β–Ί Cancel app context + β”‚ + β”œβ”€β–Ί Stop OAuth refresh manager + β”‚ └─► Prevents token refresh during shutdown + β”‚ + β”œβ”€β–Ί Stop Supervisor + β”‚ β”œβ”€β–Ί Cancel reconciliation context + β”‚ β”œβ”€β–Ί Wait for goroutines to exit + β”‚ └─► Close upstream adapter + β”‚ + β”œβ”€β–Ί ShutdownAll on upstream manager (45s total timeout) + β”‚ └─► For each server (parallel): + β”‚ β”œβ”€β–Ί Graceful close (10s) + β”‚ └─► Force kill if needed (9s) + β”‚ + β”œβ”€β–Ί Close cache manager + β”‚ + β”œβ”€β–Ί Close index manager + β”‚ + β”œβ”€β–Ί Close storage manager + β”‚ + └─► Close config service +``` + +## Debugging Shutdown Issues + +### Check for orphaned processes + +```bash +# After stopping MCPProxy, check for orphaned MCP server processes +pgrep -f "npx.*mcp" +pgrep -f "uvx.*mcp" +pgrep -f "node.*server" + +# If found, kill them manually +pkill -f "npx.*mcp" +``` + +### Enable debug logging for shutdown + +```bash +mcpproxy serve --log-level=debug 2>&1 | grep -E "(Disconnect|shutdown|SIGTERM|SIGKILL|process group)" +``` + +### View shutdown logs + +Look for these log messages during shutdown: + +``` +INFO Disconnecting from upstream MCP server +DEBUG Attempting graceful MCP client close +DEBUG MCP client closed gracefully # Success! +# OR +WARN MCP client close timed out # Graceful failed +INFO Graceful close failed, force killing process group +DEBUG SIGTERM sent to process group +INFO Process group terminated gracefully # SIGTERM worked +# OR +WARN Process group still running after SIGTERM, sending SIGKILL +INFO SIGKILL sent to process group +``` + +## Troubleshooting + +### Server processes not terminating + +**Symptoms:** `npx` or `uvx` processes remain running after MCPProxy stops. + +**Possible causes:** +1. Process ignoring SIGTERM (bad signal handling in MCP server) +2. Process group not properly set up +3. Zombie processes from previous crashes + +**Solutions:** +- Check server logs: `mcpproxy upstream logs ` +- Manually kill orphaned processes +- Report issue if consistently reproducible + +### Shutdown taking too long + +**Symptoms:** MCPProxy takes 20+ seconds to shut down. + +**Possible causes:** +1. Many servers running in parallel +2. 
Servers not responding to graceful shutdown +3. Docker containers with slow cleanup + +**Solutions:** +- Check which servers are slow: enable debug logging +- Consider disabling problematic servers before shutdown +- Report consistently slow servers as bugs + +### Docker containers not cleaning up + +**Symptoms:** Docker containers remain running after MCPProxy stops. + +**Solutions:** +```bash +# List MCPProxy containers +docker ps --filter "label=mcpproxy.managed=true" + +# Force remove all MCPProxy containers +docker rm -f $(docker ps -q --filter "label=mcpproxy.managed=true") +``` + +## Related Documentation + +- [Architecture](/architecture) - Runtime and lifecycle overview +- [Docker Isolation](/features/docker-isolation) - Container management +- [Upstream Servers](/configuration/upstream-servers) - Server configuration From 5d7324add3fe2430e165ac3f8076d20d8147de62 Mon Sep 17 00:00:00 2001 From: Algis Dumbris Date: Sun, 21 Dec 2025 10:30:24 +0200 Subject: [PATCH 3/3] docs: Add Operations section to Docusaurus site MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add operations/ to docusaurus.config.js include list - Add Operations category to sidebars.js - Update development/architecture.md with shutdown info - Fix broken link in shutdown-behavior.md (/architecture β†’ /development/architecture) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- docs/development/architecture.md | 15 +++++++++++++++ docs/operations/shutdown-behavior.md | 2 +- website/docusaurus.config.js | 2 ++ website/sidebars.js | 7 +++++++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/docs/development/architecture.md b/docs/development/architecture.md index 4571d62d..7da89e5b 100644 --- a/docs/development/architecture.md +++ b/docs/development/architecture.md @@ -86,4 +86,19 @@ Disconnected β†’ Connecting β†’ Authenticating β†’ Ready (on error) ``` +## Subprocess Shutdown + +When MCPProxy stops, subprocesses are terminated using a two-phase approach: + +1. **Graceful Close** (10s): Close MCP connection, wait for process to exit +2. **Force Kill** (9s): If still running, SIGTERM β†’ poll β†’ SIGKILL + +| Timeout | Value | Purpose | +|---------|-------|---------| +| MCP Client Close | 10s | Wait for graceful stdin/stdout close | +| SIGTERM β†’ SIGKILL | 9s | Time between graceful and force kill | +| Docker Cleanup | 30s | Container stop/kill timeout | + +See [Shutdown Behavior](/operations/shutdown-behavior) for detailed documentation. + For complete architecture details, see [docs/architecture.md](https://github.com/smart-mcp-proxy/mcpproxy-go/blob/main/docs/architecture.md) in the repository. 
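Condensed, the graceful-close guard that patch 1 adds to `DisconnectWithContext` reduces to the pattern below (a sketch with an assumed `closer` interface; the real code also logs, tracks server state, and reads fields under the mutex):

```go
package core

import "time"

// closer is a minimal stand-in for the MCP client in this sketch.
type closer interface{ Close() error }

// closeWithTimeout runs Close() in a goroutine so a hung transport cannot
// stall shutdown past the timeout. When it returns false, the caller falls
// through to the SIGTERM β†’ poll β†’ SIGKILL path in killProcessGroup.
func closeWithTimeout(c closer, timeout time.Duration) bool {
	done := make(chan struct{})
	go func() {
		_ = c.Close()
		close(done)
	}()
	select {
	case <-done:
		return true // subprocess saw EOF on stdin and exited cleanly
	case <-time.After(timeout):
		return false // mcpClientCloseTimeout elapsed; force kill is next
	}
}
```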
diff --git a/docs/operations/shutdown-behavior.md b/docs/operations/shutdown-behavior.md index 420bc95d..7265c75f 100644 --- a/docs/operations/shutdown-behavior.md +++ b/docs/operations/shutdown-behavior.md @@ -227,6 +227,6 @@ docker rm -f $(docker ps -q --filter "label=mcpproxy.managed=true") ## Related Documentation -- [Architecture](/architecture) - Runtime and lifecycle overview +- [Architecture](/development/architecture) - Runtime and lifecycle overview - [Docker Isolation](/features/docker-isolation) - Container management - [Upstream Servers](/configuration/upstream-servers) - Server configuration diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index e0dcd5d8..646718e0 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -48,6 +48,7 @@ const config = { 'api/**/*.{md,mdx}', 'web-ui/**/*.{md,mdx}', 'features/**/*.{md,mdx}', + 'operations/**/*.{md,mdx}', 'development/**/*.{md,mdx}', 'contributing.md', ], @@ -75,6 +76,7 @@ const config = { 'api/*', 'web-ui/*', 'features/*', + 'operations/*', ], }, ], diff --git a/website/sidebars.js b/website/sidebars.js index 06cc9ae0..033cd118 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -68,6 +68,13 @@ const sidebars = { 'features/version-updates', ], }, + { + type: 'category', + label: 'Operations', + items: [ + 'operations/shutdown-behavior', + ], + }, { type: 'category', label: 'Development',
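One path the new `shutdown_test.go` does not exercise is the SIGTERM β†’ SIGKILL escalation itself. A hypothetical follow-up test (not part of this series) could spawn a process group whose leader traps SIGTERM, then assert that `killProcessGroup` escalates. Note it deliberately takes about 9 seconds, since it waits out the full grace period:

```go
//go:build unix

package core

import (
	"os/exec"
	"syscall"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/zap"
)

// TestKillProcessGroup_EscalatesToSIGKILL is a sketch: the shell ignores
// SIGTERM and keeps respawning sleep, so only SIGKILL can stop the group.
func TestKillProcessGroup_EscalatesToSIGKILL(t *testing.T) {
	cmd := exec.Command("sh", "-c", `trap "" TERM; while true; do sleep 1; done`)
	// Setpgid makes the child the leader of a new process group, so its
	// PID doubles as the PGID passed to killProcessGroup.
	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	require.NoError(t, cmd.Start())

	start := time.Now()
	err := killProcessGroup(cmd.Process.Pid, zap.NewNop(), "test-server")
	elapsed := time.Since(start)

	assert.NoError(t, err)
	assert.GreaterOrEqual(t, elapsed, processGracefulTimeout,
		"escalation should wait out the SIGTERM grace period")

	_ = cmd.Wait() // reap the SIGKILLed group leader
}
```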