Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .changeset/dual-mode-capabilities.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
'@bradygaster/squad-sdk': minor
---

Add dual-mode deployment support for capabilities routing.

New features:
- `SQUAD_POD_ID` env var for pod-specific capability manifests
- `SQUAD_DEPLOYMENT_MODE` env var (`agent-per-node` | `squad-per-pod`)
- Pod-specific manifest loading: `.squad/machine-capabilities-{podId}.json`
- Fallback chain: pod-specific → shared → user-home → null (opt-in)
- New exports: `getDeploymentMode()`, `getPodId()`, `generatePodCapabilitiesPath()`, `DeploymentMode` type
56 changes: 54 additions & 2 deletions packages/squad-sdk/src/ralph/capabilities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,17 @@ import { existsSync } from 'node:fs';
import path from 'node:path';
import os from 'node:os';

/**
 * Deployment mode for capability routing.
 *
 * `getDeploymentMode()` derives this from the `SQUAD_DEPLOYMENT_MODE`
 * environment variable and falls back to `'agent-per-node'`.
 */
export type DeploymentMode = 'agent-per-node' | 'squad-per-pod';

/**
 * Machine capability manifest.
 *
 * This is the type `loadCapabilities` assigns to the JSON it parses from a
 * capabilities manifest file; the parsed value is not validated there.
 */
export interface MachineCapabilities {
/** Name of the machine the manifest describes. */
machine: string;
/** Capability identifiers — presumably those available on this machine; confirm against consumers. */
capabilities: string[];
/** Capability identifiers — presumably those known to be absent on this machine; confirm against consumers. */
missing: string[];
/** Timestamp string; NOTE(review): examples use ISO-8601 UTC such as `2026-06-01T00:00:00Z` — confirm format. */
lastUpdated: string;
/**
 * Pod identifier when running in squad-per-pod mode.
 * When the loaded manifest leaves it unset, `loadCapabilities` fills it in
 * from `SQUAD_POD_ID` (only in `squad-per-pod` mode).
 */
podId?: string;
}

/** Well-known capability identifiers */
Expand All @@ -38,9 +43,45 @@ export type KnownCapability = typeof KNOWN_CAPABILITIES[number];
/** Prefix for capability requirement labels */
const NEEDS_PREFIX = 'needs:';

/**
 * Resolve the active deployment mode from the environment.
 *
 * Only the exact value `'squad-per-pod'` in `SQUAD_DEPLOYMENT_MODE` selects
 * pod mode. An unset variable, or any other value, yields the default
 * `'agent-per-node'`.
 *
 * @returns The deployment mode to use for capability routing.
 */
export function getDeploymentMode(): DeploymentMode {
  const configured = process.env.SQUAD_DEPLOYMENT_MODE;
  return configured === 'squad-per-pod' ? 'squad-per-pod' : 'agent-per-node';
}

/**
 * Read the pod identifier from the `SQUAD_POD_ID` environment variable.
 *
 * @returns The configured pod ID, or `undefined` when the variable is unset
 *   or set to an empty string.
 */
export function getPodId(): string | undefined {
  const podId = process.env.SQUAD_POD_ID;
  return podId ? podId : undefined;
}

/**
 * Compute where the capabilities manifest for a given pod lives.
 *
 * The pod ID is interpolated verbatim into the file name, so callers should
 * pass an ID without path separators if the result must stay inside
 * `<teamRoot>/.squad`.
 *
 * @param teamRoot - Directory that contains the `.squad` folder.
 * @param podId - Pod identifier used as the file-name suffix.
 * @returns `<teamRoot>/.squad/machine-capabilities-<podId>.json`, joined with
 *   the platform's path separator.
 *
 * @example
 * generatePodCapabilitiesPath('/app', 'squad-worker-7b4f6')
 * // → '/app/.squad/machine-capabilities-squad-worker-7b4f6.json'
 */
export function generatePodCapabilitiesPath(teamRoot: string, podId: string): string {
  const manifestFile = 'machine-capabilities-' + podId + '.json';
  return path.join(teamRoot, '.squad', manifestFile);
}

/**
* Load machine capabilities from the standard location.
* The lookup order depends on the deployment mode.
*
* When `SQUAD_POD_ID` is set **and** `SQUAD_DEPLOYMENT_MODE` is
* `squad-per-pod`, the search order becomes:
* 1. `.squad/machine-capabilities-{podId}.json` (pod-specific)
* 2. `.squad/machine-capabilities.json` (shared fallback)
* 3. `~/.squad/machine-capabilities.json` (user home fallback)
*
* Otherwise (default `agent-per-node` mode):
* 1. `.squad/machine-capabilities.json` in the team root
* 2. `~/.squad/machine-capabilities.json` in the user home
*
Expand All @@ -50,8 +91,14 @@ export async function loadCapabilities(
teamRoot?: string
): Promise<MachineCapabilities | null> {
const candidates: string[] = [];
const mode = getDeploymentMode();
const podId = getPodId();

if (teamRoot) {
// In squad-per-pod mode, try pod-specific manifest first
if (mode === 'squad-per-pod' && podId) {
candidates.push(generatePodCapabilitiesPath(teamRoot, podId));
}
candidates.push(path.join(teamRoot, '.squad', 'machine-capabilities.json'));
}
candidates.push(path.join(os.homedir(), '.squad', 'machine-capabilities.json'));
Expand All @@ -60,7 +107,12 @@ export async function loadCapabilities(
if (existsSync(candidate)) {
try {
const raw = await readFile(candidate, 'utf8');
return JSON.parse(raw) as MachineCapabilities;
const parsed = JSON.parse(raw) as MachineCapabilities;
// Stamp podId onto the loaded manifest when running in pod mode
if (mode === 'squad-per-pod' && podId) {
parsed.podId = parsed.podId ?? podId;
}
return parsed;
} catch {
// Malformed file — skip
}
Expand Down
2 changes: 1 addition & 1 deletion packages/squad-sdk/src/ralph/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -184,5 +184,5 @@ export class RalphMonitor {
}
}

export { loadCapabilities, canHandleIssue, filterByCapabilities, extractNeeds, type MachineCapabilities, KNOWN_CAPABILITIES } from './capabilities.js';
export { loadCapabilities, canHandleIssue, filterByCapabilities, extractNeeds, getDeploymentMode, getPodId, generatePodCapabilitiesPath, type MachineCapabilities, type DeploymentMode, KNOWN_CAPABILITIES } from './capabilities.js';
export { getTrafficLight, shouldProceed, getRetryDelay, PredictiveCircuitBreaker, canUseQuota, loadRatePool, type RatePool, type RatePoolAllocation, type RateSample, type TrafficLight, type AgentPriority } from './rate-limiting.js';
48 changes: 47 additions & 1 deletion templates/machine-capabilities.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,11 @@ Ralph will log skipped issues:

## Kubernetes Integration

On Kubernetes, machine capabilities map to node labels:
Machine capabilities support two deployment modes on Kubernetes:

### Mode A — Agent-per-node (default)

One Ralph process per Kubernetes node. Each reads the node-local `machine-capabilities.json`. Use `nodeSelector` to pin Ralphs to nodes with the right hardware.

```yaml
# Node labels (set by capability DaemonSet or manually)
Expand All @@ -72,4 +76,46 @@ spec:
node.squad.dev/gpu: "true"
```

No extra environment variables needed — this is the default mode.

### Mode B — Squad-per-pod

Multiple full Squad instances run as separate pods (on the same or different nodes). Each pod gets its own identity via the `SQUAD_POD_ID` environment variable, which enables pod-specific capability manifests.

```yaml
# Deployment spec for squad-per-pod mode
spec:
replicas: 3
template:
spec:
containers:
- name: squad
env:
- name: SQUAD_POD_ID
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: SQUAD_DEPLOYMENT_MODE
value: squad-per-pod
```

When `SQUAD_POD_ID` is set and `SQUAD_DEPLOYMENT_MODE` is `squad-per-pod`, Ralph looks for a pod-specific manifest first:

1. `.squad/machine-capabilities-{podId}.json` (pod-specific)
2. `.squad/machine-capabilities.json` (shared fallback)
3. `~/.squad/machine-capabilities.json` (user home fallback)
4. `null` (opt-in — all issues pass through)

Example pod-specific manifest (`.squad/machine-capabilities-squad-worker-7b4f6.json`):

```json
{
"machine": "squad-worker-7b4f6",
"capabilities": ["gpu", "docker", "azure-cli"],
"missing": ["browser", "onedrive"],
"lastUpdated": "2026-06-01T00:00:00Z",
"podId": "squad-worker-7b4f6"
}
```

A DaemonSet can run capability discovery on each node and maintain labels automatically. See the [squad-on-aks](https://github.com/tamirdresher/squad-on-aks) project for a complete Kubernetes deployment example.
137 changes: 136 additions & 1 deletion test/capabilities.test.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
import { describe, it, expect } from 'vitest';
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import {
extractNeeds,
canHandleIssue,
filterByCapabilities,
loadCapabilities,
getDeploymentMode,
getPodId,
generatePodCapabilitiesPath,
type MachineCapabilities,
} from '@bradygaster/squad-sdk/ralph/capabilities';
import { existsSync, mkdirSync, writeFileSync, rmSync } from 'node:fs';
import path from 'node:path';
import os from 'node:os';

const gpuMachine: MachineCapabilities = {
machine: 'GPU-SERVER',
Expand Down Expand Up @@ -104,4 +111,132 @@ describe('filterByCapabilities', () => {
expect(handled).toHaveLength(0);
expect(skipped).toHaveLength(0);
});
});

/**
 * Dual-mode deployment suite.
 *
 * Exercises `loadCapabilities`, `getDeploymentMode` and `getPodId` under
 * different combinations of the `SQUAD_POD_ID` and `SQUAD_DEPLOYMENT_MODE`
 * environment variables. Each test runs against a fresh temporary team root
 * containing an empty `.squad` directory, and the two environment variables
 * are cleared before and restored after every test.
 */
describe('dual-mode deployment', () => {
  const ENV_VARS = ['SQUAD_POD_ID', 'SQUAD_DEPLOYMENT_MODE'] as const;
  const LAST_UPDATED = '2026-06-01T00:00:00Z';
  const SHARED_FILE = 'machine-capabilities.json';
  const POD_ABC_FILE = 'machine-capabilities-squad-worker-abc.json';

  // Values the env vars had before each test, so afterEach can put them back.
  const envBackup = new Map<string, string | undefined>();
  let tmpDir: string;

  /** Build a manifest with the fixed timestamp used throughout this suite. */
  const makeManifest = (
    machine: string,
    capabilities: string[],
    missing: string[],
  ): MachineCapabilities => ({
    machine,
    capabilities,
    missing,
    lastUpdated: LAST_UPDATED,
  });

  /** Serialize a manifest into the temporary team root's `.squad` folder. */
  const writeManifest = (fileName: string, manifest: MachineCapabilities): void => {
    writeFileSync(path.join(tmpDir, '.squad', fileName), JSON.stringify(manifest));
  };

  beforeEach(() => {
    for (const name of ENV_VARS) {
      envBackup.set(name, process.env[name]);
      delete process.env[name];
    }

    const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
    tmpDir = path.join(os.tmpdir(), `squad-cap-test-${uniqueSuffix}`);
    mkdirSync(path.join(tmpDir, '.squad'), { recursive: true });
  });

  afterEach(() => {
    for (const name of ENV_VARS) {
      const previous = envBackup.get(name);
      if (previous === undefined) {
        delete process.env[name];
      } else {
        process.env[name] = previous;
      }
    }

    try {
      rmSync(tmpDir, { recursive: true, force: true });
    } catch {
      /* ignore */
    }
  });

  it('loadCapabilities reads pod-specific manifest when SQUAD_POD_ID is set', async () => {
    process.env.SQUAD_POD_ID = 'squad-worker-abc';
    process.env.SQUAD_DEPLOYMENT_MODE = 'squad-per-pod';

    writeManifest(POD_ABC_FILE, makeManifest('POD-ABC', ['gpu', 'docker'], []));
    // The shared manifest is present too, so this proves the pod-specific one wins.
    writeManifest(SHARED_FILE, makeManifest('SHARED', ['browser'], ['gpu']));

    const loaded = await loadCapabilities(tmpDir);

    expect(loaded).not.toBeNull();
    expect(loaded?.machine).toBe('POD-ABC');
    expect(loaded?.podId).toBe('squad-worker-abc');
  });

  it('loadCapabilities falls back to shared manifest when pod-specific not found', async () => {
    process.env.SQUAD_POD_ID = 'squad-worker-xyz';
    process.env.SQUAD_DEPLOYMENT_MODE = 'squad-per-pod';

    writeManifest(SHARED_FILE, makeManifest('SHARED-FALLBACK', ['browser'], ['gpu']));

    const loaded = await loadCapabilities(tmpDir);

    expect(loaded).not.toBeNull();
    expect(loaded?.machine).toBe('SHARED-FALLBACK');
    expect(loaded?.podId).toBe('squad-worker-xyz');
  });

  it('loadCapabilities ignores SQUAD_POD_ID when SQUAD_DEPLOYMENT_MODE is agent-per-node', async () => {
    process.env.SQUAD_POD_ID = 'squad-worker-abc';
    process.env.SQUAD_DEPLOYMENT_MODE = 'agent-per-node';

    writeManifest(POD_ABC_FILE, makeManifest('POD-ABC', ['gpu', 'docker'], []));
    writeManifest(SHARED_FILE, makeManifest('SHARED', ['browser'], ['gpu']));

    const loaded = await loadCapabilities(tmpDir);

    expect(loaded).not.toBeNull();
    // Should read shared, not pod-specific, because mode is agent-per-node
    expect(loaded?.machine).toBe('SHARED');
    expect(loaded?.podId).toBeUndefined();
  });

  it('getDeploymentMode defaults to agent-per-node', () => {
    delete process.env.SQUAD_DEPLOYMENT_MODE;

    const mode = getDeploymentMode();

    expect(mode).toBe('agent-per-node');
  });

  it('getDeploymentMode reads SQUAD_DEPLOYMENT_MODE env var', () => {
    process.env.SQUAD_DEPLOYMENT_MODE = 'squad-per-pod';

    const mode = getDeploymentMode();

    expect(mode).toBe('squad-per-pod');
  });

  it('getPodId reads SQUAD_POD_ID env var', () => {
    delete process.env.SQUAD_POD_ID;
    const whenUnset = getPodId();
    expect(whenUnset).toBeUndefined();

    process.env.SQUAD_POD_ID = 'my-pod-42';
    const whenSet = getPodId();
    expect(whenSet).toBe('my-pod-42');
  });
});
Loading