Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 63 additions & 3 deletions internal/launcher/launcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ type connectionResult struct {
err error
}

// ServerState represents the observed runtime state of a backend server,
// as reported by Launcher.GetServerState. The zero value describes a server
// that has never been launched and has no recorded error.
type ServerState struct {
	Status    string    // one of "running", "stopped", or "error"
	StartedAt time.Time // time of the first successful launch; zero value means never started
	LastError string    // most recent launch error message, if any
}

// Launcher manages backend MCP server connections
type Launcher struct {
ctx context.Context
Expand All @@ -35,6 +42,8 @@ type Launcher struct {
runningInContainer bool
startupTimeout time.Duration // Timeout for backend server startup
oidcProvider *oidc.Provider
serverStartTimes map[string]time.Time // tracks when each server was successfully launched
serverErrors map[string]string // tracks the most recent error per server
}

// New creates a new Launcher
Expand Down Expand Up @@ -84,6 +93,8 @@ func New(ctx context.Context, cfg *config.Config) *Launcher {
runningInContainer: inContainer,
startupTimeout: startupTimeout,
oidcProvider: oidcProvider,
serverStartTimes: make(map[string]time.Time),
serverErrors: make(map[string]string),
}
}

Expand Down Expand Up @@ -120,6 +131,7 @@ func GetOrLaunch(l *Launcher, serverID string) (*mcp.Connection, error) {
if oidcProvider == nil {
logger.LogErrorWithServer(serverID, "backend",
"Server %q requires OIDC auth but ACTIONS_ID_TOKEN_REQUEST_URL is not set", serverID)
l.recordError(serverID, "OIDC provider not available")
return nil, fmt.Errorf(
"server %q requires OIDC authentication but ACTIONS_ID_TOKEN_REQUEST_URL is not set; "+
"OIDC auth is only available in GitHub Actions with `permissions: { id-token: write }`",
Expand All @@ -133,17 +145,25 @@ func GetOrLaunch(l *Launcher, serverID string) (*mcp.Connection, error) {
logger.LogErrorWithServer(serverID, "backend", "Failed to create HTTP connection: %s, error=%v", serverID, err)
log.Printf("[LAUNCHER] ❌ FAILED to create HTTP connection for '%s'", serverID)
log.Printf("[LAUNCHER] Error: %v", err)
l.recordError(serverID, err.Error())
return nil, fmt.Errorf("failed to create HTTP connection: %w", err)
}

logger.LogInfoWithServer(serverID, "backend", "Successfully configured HTTP MCP backend: %s", serverID)
log.Printf("[LAUNCHER] Successfully configured HTTP backend: %s", serverID)

l.recordStart(serverID)
return conn, nil
}

// stdio backends from this point
return l.launchStdioConnection(serverID, "", serverCfg)
conn, err := l.launchStdioConnection(serverID, "", serverCfg)
if err != nil {
l.recordError(serverID, err.Error())
return nil, err
}
l.recordStart(serverID)
return conn, nil
})
}

Expand Down Expand Up @@ -190,13 +210,15 @@ func GetOrLaunchForSession(l *Launcher, serverID, sessionID string) (*mcp.Connec

conn, err := l.launchStdioConnection(serverID, sessionID, serverCfg)
if err != nil {
// Record error in session pool
// Record error in session pool and server state
l.sessionPool.RecordError(serverID, sessionID)
l.recordError(serverID, err.Error())
return nil, err
}

// Add to session pool
// Add to session pool and record start
l.sessionPool.Set(serverID, sessionID, conn)
l.recordStart(serverID)
return conn, nil
}

Expand Down Expand Up @@ -283,3 +305,41 @@ func (l *Launcher) Close() {
}
logLauncher.Print("Launcher closed successfully")
}

// recordStart records a successful server launch time.
// Only the first successful launch is recorded; subsequent calls are no-ops.
// A successful start also clears any previously recorded error for the server.
//
// It acquires l.mu for writing because serverStartTimes and serverErrors are
// read under RLock in GetServerState; unsynchronized writes here would be a
// data race.
func (l *Launcher) recordStart(serverID string) {
	l.mu.Lock()
	defer l.mu.Unlock()

	if _, exists := l.serverStartTimes[serverID]; !exists {
		l.serverStartTimes[serverID] = time.Now()
		delete(l.serverErrors, serverID)
		logLauncher.Printf("Recorded server start: serverID=%s", serverID)
	}
}

// recordError records a server launch failure.
func (l *Launcher) recordError(serverID string, errMsg string) {
l.serverErrors[serverID] = errMsg
logLauncher.Printf("Recorded server error: serverID=%s, error=%s", serverID, errMsg)
Comment on lines +319 to +322
Copy link

Copilot AI Mar 31, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

recordError logs the raw errMsg into logs. Since errMsg can include untrusted/secret-bearing content (e.g., backend stderr or URLs with tokens), this can leak sensitive data into launcher logs. Sanitize/truncate the error string before logging (or log a stable error code/type and keep the full error only in memory).

Suggested change
// recordError records a server launch failure.
func (l *Launcher) recordError(serverID string, errMsg string) {
l.serverErrors[serverID] = errMsg
logLauncher.Printf("Recorded server error: serverID=%s, error=%s", serverID, errMsg)
const maxLoggedErrorLength = 512
// truncateErrorForLog returns a version of the error message that is safe to log by
// limiting its length. The full error is still kept in memory (e.g., in serverErrors).
func truncateErrorForLog(errMsg string) string {
if len(errMsg) <= maxLoggedErrorLength {
return errMsg
}
return errMsg[:maxLoggedErrorLength] + "...(truncated)"
}
// recordError records a server launch failure.
func (l *Launcher) recordError(serverID string, errMsg string) {
// Store the full error message in memory for internal use.
l.serverErrors[serverID] = errMsg
// Log only a truncated version to reduce the risk of leaking sensitive data.
safeErrMsg := truncateErrorForLog(errMsg)
logLauncher.Printf("Recorded server error: serverID=%s, error=%s", serverID, safeErrMsg)

Copilot uses AI. Check for mistakes.
}

// GetServerState reports the observed runtime state for one server.
// A recorded error takes precedence over a recorded start; a server with
// neither is reported as "stopped".
func (l *Launcher) GetServerState(serverID string) ServerState {
	l.mu.RLock()
	defer l.mu.RUnlock()

	if msg, failed := l.serverErrors[serverID]; failed {
		return ServerState{Status: "error", LastError: msg}
	}

	started, launched := l.serverStartTimes[serverID]
	if !launched {
		return ServerState{Status: "stopped"}
	}
	return ServerState{Status: "running", StartedAt: started}
}
Comment on lines +325 to +345
Copy link

Copilot AI Mar 31, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GetServerState derives "running" solely from serverStartTimes, but nothing clears/updates that state when connections are closed (e.g., Launcher.Close() or session pool cleanup). This can cause /health to report servers as running with increasing uptime even after shutdown/close. Consider clearing serverStartTimes/serverErrors in Close(), and/or recording a "stopped" state when a connection is closed so the reported status reflects reality.

Copilot uses AI. Check for mistakes.
100 changes: 100 additions & 0 deletions internal/launcher/launcher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -773,3 +773,103 @@ func TestLauncher_MixedAuthTypes(t *testing.T) {
assert.NotNil(t, l, "Launcher should be created even with OIDC servers and missing env vars")
assert.Nil(t, l.oidcProvider, "OIDC provider should be nil without env vars")
}

func TestGetServerState_StoppedByDefault(t *testing.T) {
cfg := newTestConfig(map[string]*config.ServerConfig{
"test-server": {Type: "stdio", Command: "echo"},
})
l := New(context.Background(), cfg)
defer l.Close()

state := l.GetServerState("test-server")
assert.Equal(t, "stopped", state.Status)
assert.True(t, state.StartedAt.IsZero())
assert.Empty(t, state.LastError)
}

func TestGetServerState_UnknownServer(t *testing.T) {
cfg := newTestConfig(map[string]*config.ServerConfig{})
l := New(context.Background(), cfg)
defer l.Close()

state := l.GetServerState("nonexistent")
assert.Equal(t, "stopped", state.Status)
}

func TestRecordStart_SetsRunningState(t *testing.T) {
cfg := newTestConfig(map[string]*config.ServerConfig{
"test-server": {Type: "stdio", Command: "echo"},
})
l := New(context.Background(), cfg)
defer l.Close()

l.recordStart("test-server")

state := l.GetServerState("test-server")
assert.Equal(t, "running", state.Status)
assert.False(t, state.StartedAt.IsZero())
assert.Empty(t, state.LastError)
}

func TestRecordStart_OnlyRecordsFirstLaunch(t *testing.T) {
cfg := newTestConfig(map[string]*config.ServerConfig{
"test-server": {Type: "stdio", Command: "echo"},
})
l := New(context.Background(), cfg)
defer l.Close()

l.recordStart("test-server")
firstStart := l.serverStartTimes["test-server"]

// Second call should be a no-op
l.recordStart("test-server")
secondStart := l.serverStartTimes["test-server"]
Comment on lines +822 to +826
Copy link

Copilot AI Mar 31, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test reaches into l.serverStartTimes directly to assert behavior. That couples the test to the Launcher's internal representation and bypasses the public surface (GetServerState). Prefer asserting via GetServerState().StartedAt across two recordStart calls so internal refactors (e.g., switching to a different storage/locking strategy) don't break the test unnecessarily.

Suggested change
firstStart := l.serverStartTimes["test-server"]
// Second call should be a no-op
l.recordStart("test-server")
secondStart := l.serverStartTimes["test-server"]
firstState := l.GetServerState("test-server")
firstStart := firstState.StartedAt
// Second call should be a no-op
l.recordStart("test-server")
secondState := l.GetServerState("test-server")
secondStart := secondState.StartedAt

Copilot uses AI. Check for mistakes.

assert.Equal(t, firstStart, secondStart)
}

func TestRecordError_SetsErrorState(t *testing.T) {
cfg := newTestConfig(map[string]*config.ServerConfig{
"test-server": {Type: "stdio", Command: "echo"},
})
l := New(context.Background(), cfg)
defer l.Close()

l.recordError("test-server", "connection refused")

state := l.GetServerState("test-server")
assert.Equal(t, "error", state.Status)
assert.Equal(t, "connection refused", state.LastError)
}

func TestRecordStart_ClearsError(t *testing.T) {
cfg := newTestConfig(map[string]*config.ServerConfig{
"test-server": {Type: "stdio", Command: "echo"},
})
l := New(context.Background(), cfg)
defer l.Close()

l.recordError("test-server", "connection refused")
state := l.GetServerState("test-server")
assert.Equal(t, "error", state.Status)

l.recordStart("test-server")
state = l.GetServerState("test-server")
assert.Equal(t, "running", state.Status)
assert.Empty(t, state.LastError)
}

func TestGetServerState_ErrorTakesPrecedenceOverStart(t *testing.T) {
cfg := newTestConfig(map[string]*config.ServerConfig{
"test-server": {Type: "stdio", Command: "echo"},
})
l := New(context.Background(), cfg)
defer l.Close()

l.recordStart("test-server")
l.recordError("test-server", "server crashed")

state := l.GetServerState("test-server")
assert.Equal(t, "error", state.Status)
assert.Equal(t, "server crashed", state.LastError)
}
42 changes: 42 additions & 0 deletions internal/server/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -465,3 +465,45 @@ func TestBuildHealthResponse_HealthyStatus(t *testing.T) {
assert.NotEmpty(response.GatewayVersion, "gatewayVersion should not be empty")
assert.NotNil(response.Servers, "Servers map should not be nil")
}

// TestGetServerStatus_ErrorOnFailedLaunch verifies servers report "error" when launch fails.
// Uses a clearly nonexistent binary so the launch fails fast and deterministically,
// instead of `docker run`, which can hang, pull images, or wait out the default
// startup timeout depending on the environment.
func TestGetServerStatus_ErrorOnFailedLaunch(t *testing.T) {
	assert := assert.New(t)
	require := require.New(t)

	cfg := &config.Config{
		Servers: map[string]*config.ServerConfig{
			"server-a": {Type: "stdio", Command: "/nonexistent/definitely-missing-binary"},
		},
	}

	ctx := context.Background()
	us, err := NewUnified(ctx, cfg)
	require.NoError(err)
	t.Cleanup(func() { us.Close() })

	status := us.GetServerStatus()
	require.Contains(status, "server-a")
	assert.Equal("error", status["server-a"].Status, "Server should report 'error' after failed launch")
	assert.Equal(0, status["server-a"].Uptime, "Uptime should be 0 for errored server")
}

// TestBuildHealthResponse_UnhealthyOnError verifies that failed servers make the gateway unhealthy.
// Uses a clearly nonexistent binary so the launch fails fast and deterministically,
// instead of `docker run`, which can hang, pull images, or wait out the default
// startup timeout depending on the environment.
func TestBuildHealthResponse_UnhealthyOnError(t *testing.T) {
	assert := assert.New(t)
	require := require.New(t)

	cfg := &config.Config{
		Servers: map[string]*config.ServerConfig{
			"server-a": {Type: "stdio", Command: "/nonexistent/definitely-missing-binary"},
		},
	}

	ctx := context.Background()
	us, err := NewUnified(ctx, cfg)
	require.NoError(err)
	t.Cleanup(func() { us.Close() })

	response := BuildHealthResponse(us)
	assert.Equal("unhealthy", response.Status, "Gateway should be unhealthy when a server has errors")
}
Comment on lines +474 to +509
Copy link

Copilot AI Mar 31, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These tests call NewUnified(), which triggers tool registration and will attempt to launch the configured stdio backend. Using docker run test can hang/pull images and then wait up to the default 60s startup timeout, making the unit test slow/flaky depending on the environment. Use a deterministic fast-fail command (e.g., a clearly nonexistent binary like other launcher tests do) and/or set cfg.Gateway.StartupTimeout to a very small value in the test.

Copilot uses AI. Check for mistakes.
15 changes: 7 additions & 8 deletions internal/server/unified.go
Original file line number Diff line number Diff line change
Expand Up @@ -585,18 +585,17 @@ func (us *UnifiedServer) GetServerIDs() []string {
func (us *UnifiedServer) GetServerStatus() map[string]ServerStatus {
status := make(map[string]ServerStatus)

// Get all configured servers
serverIDs := us.launcher.ServerIDs()

for _, serverID := range serverIDs {
// Check if server has been launched by checking launcher connections
// For now, we'll return "running" for all configured servers
// and track uptime from when the gateway started
// This is a simple implementation - a more sophisticated version
// would track actual connection state per server
state := us.launcher.GetServerState(serverID)
uptime := 0
if !state.StartedAt.IsZero() {
uptime = int(time.Since(state.StartedAt).Seconds())
}
status[serverID] = ServerStatus{
Status: "running",
Uptime: 0, // Will be properly tracked when servers are actually launched
Status: state.Status,
Uptime: uptime,
}
}

Expand Down
Loading