-
Notifications
You must be signed in to change notification settings - Fork 21
fix: track real server status and uptime in health endpoint #2938
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,13 @@ type connectionResult struct { | |
| err error | ||
| } | ||
|
|
||
| // ServerState represents the observed runtime state of a backend server. | ||
| type ServerState struct { | ||
| Status string // "running" | "stopped" | "error" | ||
| StartedAt time.Time // zero value means never started | ||
| LastError string // most recent error message, if any | ||
| } | ||
|
|
||
| // Launcher manages backend MCP server connections | ||
| type Launcher struct { | ||
| ctx context.Context | ||
|
|
@@ -35,6 +42,8 @@ type Launcher struct { | |
| runningInContainer bool | ||
| startupTimeout time.Duration // Timeout for backend server startup | ||
| oidcProvider *oidc.Provider | ||
| serverStartTimes map[string]time.Time // tracks when each server was successfully launched | ||
| serverErrors map[string]string // tracks the most recent error per server | ||
| } | ||
|
|
||
| // New creates a new Launcher | ||
|
|
@@ -84,6 +93,8 @@ func New(ctx context.Context, cfg *config.Config) *Launcher { | |
| runningInContainer: inContainer, | ||
| startupTimeout: startupTimeout, | ||
| oidcProvider: oidcProvider, | ||
| serverStartTimes: make(map[string]time.Time), | ||
| serverErrors: make(map[string]string), | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -120,6 +131,7 @@ func GetOrLaunch(l *Launcher, serverID string) (*mcp.Connection, error) { | |
| if oidcProvider == nil { | ||
| logger.LogErrorWithServer(serverID, "backend", | ||
| "Server %q requires OIDC auth but ACTIONS_ID_TOKEN_REQUEST_URL is not set", serverID) | ||
| l.recordError(serverID, "OIDC provider not available") | ||
| return nil, fmt.Errorf( | ||
| "server %q requires OIDC authentication but ACTIONS_ID_TOKEN_REQUEST_URL is not set; "+ | ||
| "OIDC auth is only available in GitHub Actions with `permissions: { id-token: write }`", | ||
|
|
@@ -133,17 +145,25 @@ func GetOrLaunch(l *Launcher, serverID string) (*mcp.Connection, error) { | |
| logger.LogErrorWithServer(serverID, "backend", "Failed to create HTTP connection: %s, error=%v", serverID, err) | ||
| log.Printf("[LAUNCHER] ❌ FAILED to create HTTP connection for '%s'", serverID) | ||
| log.Printf("[LAUNCHER] Error: %v", err) | ||
| l.recordError(serverID, err.Error()) | ||
| return nil, fmt.Errorf("failed to create HTTP connection: %w", err) | ||
| } | ||
|
|
||
| logger.LogInfoWithServer(serverID, "backend", "Successfully configured HTTP MCP backend: %s", serverID) | ||
| log.Printf("[LAUNCHER] Successfully configured HTTP backend: %s", serverID) | ||
|
|
||
| l.recordStart(serverID) | ||
| return conn, nil | ||
| } | ||
|
|
||
| // stdio backends from this point | ||
| return l.launchStdioConnection(serverID, "", serverCfg) | ||
| conn, err := l.launchStdioConnection(serverID, "", serverCfg) | ||
| if err != nil { | ||
| l.recordError(serverID, err.Error()) | ||
| return nil, err | ||
| } | ||
| l.recordStart(serverID) | ||
| return conn, nil | ||
| }) | ||
| } | ||
|
|
||
|
|
@@ -190,13 +210,15 @@ func GetOrLaunchForSession(l *Launcher, serverID, sessionID string) (*mcp.Connec | |
|
|
||
| conn, err := l.launchStdioConnection(serverID, sessionID, serverCfg) | ||
| if err != nil { | ||
| // Record error in session pool | ||
| // Record error in session pool and server state | ||
| l.sessionPool.RecordError(serverID, sessionID) | ||
| l.recordError(serverID, err.Error()) | ||
| return nil, err | ||
| } | ||
|
|
||
| // Add to session pool | ||
| // Add to session pool and record start | ||
| l.sessionPool.Set(serverID, sessionID, conn) | ||
| l.recordStart(serverID) | ||
| return conn, nil | ||
| } | ||
|
|
||
|
|
@@ -283,3 +305,41 @@ func (l *Launcher) Close() { | |
| } | ||
| logLauncher.Print("Launcher closed successfully") | ||
| } | ||
|
|
||
| // recordStart records a successful server launch time. | ||
| // Only the first successful launch is recorded; subsequent calls are no-ops. | ||
| func (l *Launcher) recordStart(serverID string) { | ||
| if _, exists := l.serverStartTimes[serverID]; !exists { | ||
| l.serverStartTimes[serverID] = time.Now() | ||
| delete(l.serverErrors, serverID) | ||
| logLauncher.Printf("Recorded server start: serverID=%s", serverID) | ||
| } | ||
| } | ||
|
|
||
| // recordError records a server launch failure. | ||
| func (l *Launcher) recordError(serverID string, errMsg string) { | ||
| l.serverErrors[serverID] = errMsg | ||
| logLauncher.Printf("Recorded server error: serverID=%s, error=%s", serverID, errMsg) | ||
| } | ||
|
|
||
| // GetServerState returns the observed runtime state for a single server. | ||
| func (l *Launcher) GetServerState(serverID string) ServerState { | ||
| l.mu.RLock() | ||
| defer l.mu.RUnlock() | ||
|
|
||
| if errMsg, hasErr := l.serverErrors[serverID]; hasErr { | ||
| return ServerState{ | ||
| Status: "error", | ||
| LastError: errMsg, | ||
| } | ||
| } | ||
|
|
||
| if startedAt, ok := l.serverStartTimes[serverID]; ok { | ||
| return ServerState{ | ||
| Status: "running", | ||
| StartedAt: startedAt, | ||
| } | ||
| } | ||
|
|
||
| return ServerState{Status: "stopped"} | ||
| } | ||
|
Comment on lines
+325
to
+345
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -773,3 +773,103 @@ func TestLauncher_MixedAuthTypes(t *testing.T) { | |||||||||||||||||||||||||
| assert.NotNil(t, l, "Launcher should be created even with OIDC servers and missing env vars") | ||||||||||||||||||||||||||
| assert.Nil(t, l.oidcProvider, "OIDC provider should be nil without env vars") | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| func TestGetServerState_StoppedByDefault(t *testing.T) { | ||||||||||||||||||||||||||
| cfg := newTestConfig(map[string]*config.ServerConfig{ | ||||||||||||||||||||||||||
| "test-server": {Type: "stdio", Command: "echo"}, | ||||||||||||||||||||||||||
| }) | ||||||||||||||||||||||||||
| l := New(context.Background(), cfg) | ||||||||||||||||||||||||||
| defer l.Close() | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| state := l.GetServerState("test-server") | ||||||||||||||||||||||||||
| assert.Equal(t, "stopped", state.Status) | ||||||||||||||||||||||||||
| assert.True(t, state.StartedAt.IsZero()) | ||||||||||||||||||||||||||
| assert.Empty(t, state.LastError) | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| func TestGetServerState_UnknownServer(t *testing.T) { | ||||||||||||||||||||||||||
| cfg := newTestConfig(map[string]*config.ServerConfig{}) | ||||||||||||||||||||||||||
| l := New(context.Background(), cfg) | ||||||||||||||||||||||||||
| defer l.Close() | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| state := l.GetServerState("nonexistent") | ||||||||||||||||||||||||||
| assert.Equal(t, "stopped", state.Status) | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| func TestRecordStart_SetsRunningState(t *testing.T) { | ||||||||||||||||||||||||||
| cfg := newTestConfig(map[string]*config.ServerConfig{ | ||||||||||||||||||||||||||
| "test-server": {Type: "stdio", Command: "echo"}, | ||||||||||||||||||||||||||
| }) | ||||||||||||||||||||||||||
| l := New(context.Background(), cfg) | ||||||||||||||||||||||||||
| defer l.Close() | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| l.recordStart("test-server") | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| state := l.GetServerState("test-server") | ||||||||||||||||||||||||||
| assert.Equal(t, "running", state.Status) | ||||||||||||||||||||||||||
| assert.False(t, state.StartedAt.IsZero()) | ||||||||||||||||||||||||||
| assert.Empty(t, state.LastError) | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| func TestRecordStart_OnlyRecordsFirstLaunch(t *testing.T) { | ||||||||||||||||||||||||||
| cfg := newTestConfig(map[string]*config.ServerConfig{ | ||||||||||||||||||||||||||
| "test-server": {Type: "stdio", Command: "echo"}, | ||||||||||||||||||||||||||
| }) | ||||||||||||||||||||||||||
| l := New(context.Background(), cfg) | ||||||||||||||||||||||||||
| defer l.Close() | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| l.recordStart("test-server") | ||||||||||||||||||||||||||
| firstStart := l.serverStartTimes["test-server"] | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| // Second call should be a no-op | ||||||||||||||||||||||||||
| l.recordStart("test-server") | ||||||||||||||||||||||||||
| secondStart := l.serverStartTimes["test-server"] | ||||||||||||||||||||||||||
|
Comment on lines
+822
to
+826
|
||||||||||||||||||||||||||
| firstStart := l.serverStartTimes["test-server"] | |
| // Second call should be a no-op | |
| l.recordStart("test-server") | |
| secondStart := l.serverStartTimes["test-server"] | |
| firstState := l.GetServerState("test-server") | |
| firstStart := firstState.StartedAt | |
| // Second call should be a no-op | |
| l.recordStart("test-server") | |
| secondState := l.GetServerState("test-server") | |
| secondStart := secondState.StartedAt |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -465,3 +465,45 @@ func TestBuildHealthResponse_HealthyStatus(t *testing.T) { | |
| assert.NotEmpty(response.GatewayVersion, "gatewayVersion should not be empty") | ||
| assert.NotNil(response.Servers, "Servers map should not be nil") | ||
| } | ||
|
|
||
| // TestGetServerStatus_ErrorOnFailedLaunch verifies servers report "error" when launch fails | ||
| func TestGetServerStatus_ErrorOnFailedLaunch(t *testing.T) { | ||
| assert := assert.New(t) | ||
| require := require.New(t) | ||
|
|
||
| cfg := &config.Config{ | ||
| Servers: map[string]*config.ServerConfig{ | ||
| "server-a": {Type: "stdio", Command: "docker", Args: []string{"run", "test"}}, | ||
| }, | ||
| } | ||
|
|
||
| ctx := context.Background() | ||
| us, err := NewUnified(ctx, cfg) | ||
| require.NoError(err) | ||
| t.Cleanup(func() { us.Close() }) | ||
|
|
||
| status := us.GetServerStatus() | ||
| require.Contains(status, "server-a") | ||
| assert.Equal("error", status["server-a"].Status, "Server should report 'error' after failed launch") | ||
| assert.Equal(0, status["server-a"].Uptime, "Uptime should be 0 for errored server") | ||
| } | ||
|
|
||
| // TestBuildHealthResponse_UnhealthyOnError verifies that failed servers make the gateway unhealthy | ||
| func TestBuildHealthResponse_UnhealthyOnError(t *testing.T) { | ||
| assert := assert.New(t) | ||
| require := require.New(t) | ||
|
|
||
| cfg := &config.Config{ | ||
| Servers: map[string]*config.ServerConfig{ | ||
| "server-a": {Type: "stdio", Command: "docker", Args: []string{"run", "test"}}, | ||
| }, | ||
| } | ||
|
|
||
| ctx := context.Background() | ||
| us, err := NewUnified(ctx, cfg) | ||
| require.NoError(err) | ||
| t.Cleanup(func() { us.Close() }) | ||
|
|
||
| response := BuildHealthResponse(us) | ||
| assert.Equal("unhealthy", response.Status, "Gateway should be unhealthy when a server has errors") | ||
| } | ||
|
Comment on lines
+474
to
+509
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
recordError logs the raw errMsg into logs. Since errMsg can include untrusted/secret-bearing content (e.g., backend stderr or URLs with tokens), this can leak sensitive data into launcher logs. Sanitize/truncate the error string before logging (or log a stable error code/type and keep the full error only in memory).