From dafc495754c8331cdbc0354c8f0a38dfc4a65e5e Mon Sep 17 00:00:00 2001 From: Copilot Date: Mon, 23 Mar 2026 11:22:20 +0200 Subject: [PATCH] feat(capabilities): dual-mode deployment support (agent-per-node + squad-per-pod) Adds SQUAD_POD_ID and SQUAD_DEPLOYMENT_MODE env vars for pod-specific capability routing. Pod-specific manifests override shared manifests. Closes #514 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .changeset/dual-mode-capabilities.md | 12 ++ packages/squad-sdk/src/ralph/capabilities.ts | 56 +++++++- packages/squad-sdk/src/ralph/index.ts | 2 +- templates/machine-capabilities.md | 48 ++++++- test/capabilities.test.ts | 137 ++++++++++++++++++- 5 files changed, 250 insertions(+), 5 deletions(-) create mode 100644 .changeset/dual-mode-capabilities.md diff --git a/.changeset/dual-mode-capabilities.md b/.changeset/dual-mode-capabilities.md new file mode 100644 index 000000000..497aeda5c --- /dev/null +++ b/.changeset/dual-mode-capabilities.md @@ -0,0 +1,12 @@ +--- +'@bradygaster/squad-sdk': minor +--- + +Add dual-mode deployment support for capabilities routing. 
+ +New features: +- `SQUAD_POD_ID` env var for pod-specific capability manifests +- `SQUAD_DEPLOYMENT_MODE` env var (`agent-per-node` | `squad-per-pod`) +- Pod-specific manifest loading: `.squad/machine-capabilities-{podId}.json` +- Fallback chain: pod-specific → shared → user-home → null (opt-in) +- New exports: `getDeploymentMode()`, `getPodId()`, `generatePodCapabilitiesPath()`, `DeploymentMode` type diff --git a/packages/squad-sdk/src/ralph/capabilities.ts b/packages/squad-sdk/src/ralph/capabilities.ts index 8a00223e8..173e33c22 100644 --- a/packages/squad-sdk/src/ralph/capabilities.ts +++ b/packages/squad-sdk/src/ralph/capabilities.ts @@ -13,12 +13,17 @@ import { existsSync } from 'node:fs'; import path from 'node:path'; import os from 'node:os'; +/** Deployment mode for capability routing */ +export type DeploymentMode = 'agent-per-node' | 'squad-per-pod'; + /** Machine capability manifest */ export interface MachineCapabilities { machine: string; capabilities: string[]; missing: string[]; lastUpdated: string; + /** Pod identifier when running in squad-per-pod mode */ + podId?: string; } /** Well-known capability identifiers */ @@ -38,9 +43,45 @@ export type KnownCapability = typeof KNOWN_CAPABILITIES[number]; /** Prefix for capability requirement labels */ const NEEDS_PREFIX = 'needs:'; +/** + * Get the deployment mode from the `SQUAD_DEPLOYMENT_MODE` env var. + * Defaults to `'agent-per-node'` when unset or set to any other value. + */ +export function getDeploymentMode(): DeploymentMode { + const raw = process.env.SQUAD_DEPLOYMENT_MODE; + if (raw === 'squad-per-pod') return 'squad-per-pod'; + return 'agent-per-node'; +} + +/** + * Get the pod identifier from the `SQUAD_POD_ID` env var. + * Returns `undefined` when unset or empty. + */ +export function getPodId(): string | undefined { + return process.env.SQUAD_POD_ID || undefined; +} + +/** + * Build the path for a pod-specific capabilities manifest; `podId` is used verbatim (not validated). 
+ * + * @example + * generatePodCapabilitiesPath('/app', 'squad-worker-7b4f6') + * // → '/app/.squad/machine-capabilities-squad-worker-7b4f6.json' + */ +export function generatePodCapabilitiesPath(teamRoot: string, podId: string): string { + return path.join(teamRoot, '.squad', `machine-capabilities-${podId}.json`); +} + /** * Load machine capabilities from the standard location. - * Checks (in order): + * + * When `SQUAD_POD_ID` is set **and** `SQUAD_DEPLOYMENT_MODE` is + * `squad-per-pod`, the search order becomes (1–2 only when `teamRoot` is given): + * 1. `.squad/machine-capabilities-{podId}.json` (pod-specific) + * 2. `.squad/machine-capabilities.json` (shared fallback) + * 3. `~/.squad/machine-capabilities.json` (user home fallback) + * + * Otherwise (default `agent-per-node` mode): * 1. `.squad/machine-capabilities.json` in the team root * 2. `~/.squad/machine-capabilities.json` in the user home * @@ -50,8 +91,14 @@ export async function loadCapabilities( teamRoot?: string ): Promise { const candidates: string[] = []; + const mode = getDeploymentMode(); + const podId = getPodId(); if (teamRoot) { + // In squad-per-pod mode, try pod-specific manifest first + if (mode === 'squad-per-pod' && podId) { + candidates.push(generatePodCapabilitiesPath(teamRoot, podId)); + } candidates.push(path.join(teamRoot, '.squad', 'machine-capabilities.json')); } candidates.push(path.join(os.homedir(), '.squad', 'machine-capabilities.json')); @@ -60,7 +107,12 @@ export async function loadCapabilities( if (existsSync(candidate)) { try { const raw = await readFile(candidate, 'utf8'); - return JSON.parse(raw) as MachineCapabilities; + const parsed = JSON.parse(raw) as MachineCapabilities; + // In pod mode, stamp the env pod id onto the manifest unless the file already sets podId + if (mode === 'squad-per-pod' && podId) { + parsed.podId = parsed.podId ?? 
podId; + } + return parsed; } catch { // Malformed file — skip } diff --git a/packages/squad-sdk/src/ralph/index.ts b/packages/squad-sdk/src/ralph/index.ts index 4a773f23c..36afd67a3 100644 --- a/packages/squad-sdk/src/ralph/index.ts +++ b/packages/squad-sdk/src/ralph/index.ts @@ -184,5 +184,5 @@ export class RalphMonitor { } } -export { loadCapabilities, canHandleIssue, filterByCapabilities, extractNeeds, type MachineCapabilities, KNOWN_CAPABILITIES } from './capabilities.js'; +export { loadCapabilities, canHandleIssue, filterByCapabilities, extractNeeds, getDeploymentMode, getPodId, generatePodCapabilitiesPath, type MachineCapabilities, type DeploymentMode, KNOWN_CAPABILITIES } from './capabilities.js'; export { getTrafficLight, shouldProceed, getRetryDelay, PredictiveCircuitBreaker, canUseQuota, loadRatePool, type RatePool, type RatePoolAllocation, type RateSample, type TrafficLight, type AgentPriority } from './rate-limiting.js'; diff --git a/templates/machine-capabilities.md b/templates/machine-capabilities.md index b770fd04b..fd709643c 100644 --- a/templates/machine-capabilities.md +++ b/templates/machine-capabilities.md @@ -59,7 +59,11 @@ Ralph will log skipped issues: ## Kubernetes Integration -On Kubernetes, machine capabilities map to node labels: +Machine capabilities support two deployment modes on Kubernetes: + +### Mode A — Agent-per-node (default) + +One Ralph process per Kubernetes node. Each reads the node-local `machine-capabilities.json`. Use `nodeSelector` to pin Ralphs to nodes with the right hardware. ```yaml # Node labels (set by capability DaemonSet or manually) @@ -72,4 +76,46 @@ spec: node.squad.dev/gpu: "true" ``` +No extra environment variables needed — this is the default mode. + +### Mode B — Squad-per-pod + +Multiple full Squad instances run as separate pods (on the same or different nodes). Each pod gets its own identity via the `SQUAD_POD_ID` environment variable, which enables pod-specific capability manifests. 
+ +```yaml +# Deployment spec for squad-per-pod mode +spec: + replicas: 3 + template: + spec: + containers: + - name: squad + env: + - name: SQUAD_POD_ID + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SQUAD_DEPLOYMENT_MODE + value: squad-per-pod +``` + +When `SQUAD_POD_ID` is set and `SQUAD_DEPLOYMENT_MODE` is `squad-per-pod`, Ralph looks for a pod-specific manifest first: + +1. `.squad/machine-capabilities-{podId}.json` (pod-specific) +2. `.squad/machine-capabilities.json` (shared fallback) +3. `~/.squad/machine-capabilities.json` (user home fallback) +4. No manifest found → `null` (capability routing is opt-in — all issues pass through) + +Example pod-specific manifest (`.squad/machine-capabilities-squad-worker-7b4f6.json`); `podId` is optional and is stamped from `SQUAD_POD_ID` when omitted: + +```json +{ + "machine": "squad-worker-7b4f6", + "capabilities": ["gpu", "docker", "azure-cli"], + "missing": ["browser", "onedrive"], + "lastUpdated": "2026-06-01T00:00:00Z", + "podId": "squad-worker-7b4f6" +} +``` + A DaemonSet can run capability discovery on each node and maintain labels automatically. See the [squad-on-aks](https://github.com/tamirdresher/squad-on-aks) project for a complete Kubernetes deployment example. 
\ No newline at end of file diff --git a/test/capabilities.test.ts b/test/capabilities.test.ts index dd4cf9627..8bd2b6248 100644 --- a/test/capabilities.test.ts +++ b/test/capabilities.test.ts @@ -1,10 +1,17 @@ -import { describe, it, expect } from 'vitest'; +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; import { extractNeeds, canHandleIssue, filterByCapabilities, + loadCapabilities, + getDeploymentMode, + getPodId, + generatePodCapabilitiesPath, type MachineCapabilities, } from '@bradygaster/squad-sdk/ralph/capabilities'; +import { existsSync, mkdirSync, writeFileSync, rmSync } from 'node:fs'; +import path from 'node:path'; +import os from 'node:os'; const gpuMachine: MachineCapabilities = { machine: 'GPU-SERVER', @@ -104,4 +111,132 @@ describe('filterByCapabilities', () => { expect(handled).toHaveLength(0); expect(skipped).toHaveLength(0); }); +}); + +describe('dual-mode deployment', () => { + let savedPodId: string | undefined; + let savedMode: string | undefined; + let tmpDir: string; + + beforeEach(() => { + savedPodId = process.env.SQUAD_POD_ID; + savedMode = process.env.SQUAD_DEPLOYMENT_MODE; + delete process.env.SQUAD_POD_ID; + delete process.env.SQUAD_DEPLOYMENT_MODE; + + tmpDir = path.join(os.tmpdir(), `squad-cap-test-${Date.now()}-${Math.random().toString(36).slice(2)}`); + mkdirSync(path.join(tmpDir, '.squad'), { recursive: true }); + }); + + afterEach(() => { + if (savedPodId !== undefined) process.env.SQUAD_POD_ID = savedPodId; + else delete process.env.SQUAD_POD_ID; + if (savedMode !== undefined) process.env.SQUAD_DEPLOYMENT_MODE = savedMode; + else delete process.env.SQUAD_DEPLOYMENT_MODE; + + try { rmSync(tmpDir, { recursive: true, force: true }); } catch { /* ignore */ } + }); + + it('loadCapabilities reads pod-specific manifest when SQUAD_POD_ID is set', async () => { + process.env.SQUAD_POD_ID = 'squad-worker-abc'; + process.env.SQUAD_DEPLOYMENT_MODE = 'squad-per-pod'; + + const podManifest: MachineCapabilities = { + 
machine: 'POD-ABC', + capabilities: ['gpu', 'docker'], + missing: [], + lastUpdated: '2026-06-01T00:00:00Z', + }; + writeFileSync( + path.join(tmpDir, '.squad', 'machine-capabilities-squad-worker-abc.json'), + JSON.stringify(podManifest), + ); + // Also write shared manifest to ensure pod-specific wins + const sharedManifest: MachineCapabilities = { + machine: 'SHARED', + capabilities: ['browser'], + missing: ['gpu'], + lastUpdated: '2026-06-01T00:00:00Z', + }; + writeFileSync( + path.join(tmpDir, '.squad', 'machine-capabilities.json'), + JSON.stringify(sharedManifest), + ); + + const caps = await loadCapabilities(tmpDir); + expect(caps).not.toBeNull(); + expect(caps!.machine).toBe('POD-ABC'); + expect(caps!.podId).toBe('squad-worker-abc'); + }); + + it('loadCapabilities falls back to shared manifest when pod-specific not found', async () => { + process.env.SQUAD_POD_ID = 'squad-worker-xyz'; + process.env.SQUAD_DEPLOYMENT_MODE = 'squad-per-pod'; + + const sharedManifest: MachineCapabilities = { + machine: 'SHARED-FALLBACK', + capabilities: ['browser'], + missing: ['gpu'], + lastUpdated: '2026-06-01T00:00:00Z', + }; + writeFileSync( + path.join(tmpDir, '.squad', 'machine-capabilities.json'), + JSON.stringify(sharedManifest), + ); + + const caps = await loadCapabilities(tmpDir); + expect(caps).not.toBeNull(); + expect(caps!.machine).toBe('SHARED-FALLBACK'); + expect(caps!.podId).toBe('squad-worker-xyz'); + }); + + it('loadCapabilities ignores SQUAD_POD_ID when SQUAD_DEPLOYMENT_MODE is agent-per-node', async () => { + process.env.SQUAD_POD_ID = 'squad-worker-abc'; + process.env.SQUAD_DEPLOYMENT_MODE = 'agent-per-node'; + + const podManifest: MachineCapabilities = { + machine: 'POD-ABC', + capabilities: ['gpu', 'docker'], + missing: [], + lastUpdated: '2026-06-01T00:00:00Z', + }; + writeFileSync( + path.join(tmpDir, '.squad', 'machine-capabilities-squad-worker-abc.json'), + JSON.stringify(podManifest), + ); + const sharedManifest: MachineCapabilities = { + machine: 
'SHARED', + capabilities: ['browser'], + missing: ['gpu'], + lastUpdated: '2026-06-01T00:00:00Z', + }; + writeFileSync( + path.join(tmpDir, '.squad', 'machine-capabilities.json'), + JSON.stringify(sharedManifest), + ); + + const caps = await loadCapabilities(tmpDir); + expect(caps).not.toBeNull(); + // Should read shared, not pod-specific, because mode is agent-per-node + expect(caps!.machine).toBe('SHARED'); + expect(caps!.podId).toBeUndefined(); + }); + + it('getDeploymentMode defaults to agent-per-node', () => { + delete process.env.SQUAD_DEPLOYMENT_MODE; + expect(getDeploymentMode()).toBe('agent-per-node'); + }); + + it('getDeploymentMode reads SQUAD_DEPLOYMENT_MODE env var', () => { + process.env.SQUAD_DEPLOYMENT_MODE = 'squad-per-pod'; + expect(getDeploymentMode()).toBe('squad-per-pod'); + }); + + it('getPodId reads SQUAD_POD_ID env var', () => { + delete process.env.SQUAD_POD_ID; + expect(getPodId()).toBeUndefined(); + + process.env.SQUAD_POD_ID = 'my-pod-42'; + expect(getPodId()).toBe('my-pod-42'); + }); }); \ No newline at end of file