agi-inc · JacobFV · Feb 10, 2026 · Feb 10, 2026 · Feb 10, 2026 · Feb 10, 2026
diff --git a/MULTIMODAL_UPDATES.md b/MULTIMODAL_UPDATES.md
@@ -0,0 +1,202 @@
+# Multimodal Driver Support - Node SDK Updates
+
+This update adds comprehensive multimodal support to the Node SDK to match the new agi-driver capabilities.
+
+## Changes Made
+
+### Protocol Updates (`src/driver/protocol.ts`)
+
+#### New Event Types
+- `AudioTranscriptEvent`: Audio transcript from buffer
+- `VideoFrameEvent`: Video frame from camera/screen
+- `SpeechStartedEvent`: TTS playback started
+- `SpeechFinishedEvent`: TTS playback finished
+- `TurnDetectedEvent`: Voice turn detection
+
+#### New Command Types
+- `GetAudioTranscriptCommand`: Request audio transcript
+- `GetVideoFrameCommand`: Request video frame
+
+#### New Interfaces
+- `MCPServerConfig`: MCP server configuration
+- `AgentIdentity`: Agent identity information
+- `ToolChoice`: Tool choice configuration type
+
+#### Updated StartCommand
+Added fields for multimodal configuration:
+- `agent_identity?: AgentIdentity` - Agent identity (default: agi-2-claude by AGI Company)
+- `tool_choice?: ToolChoice` - Tool choice mode
+- `mcp_servers?: MCPServerConfig[]` - MCP server configurations
+- `audio_input_enabled?: boolean`, `audio_buffer_seconds?: number`
+- `turn_detection_enabled?: boolean`, `turn_detection_silence_ms?: number`
+- `speech_output_enabled?: boolean`, `speech_voice?: string`
+- `camera_enabled?: boolean`, `camera_buffer_seconds?: number`
+- `screen_recording_enabled?: boolean`, `screen_recording_buffer_seconds?: number`
+
+### Exports (`src/driver/index.ts`)
+Added exports for all new event and command types, plus helper interfaces.
+
+## Usage Examples
+
+### Basic Multimodal Session
+
+```typescript
+import { AgentDriver } from '@agi_inc/agi-js';
+
+const driver = new AgentDriver({
+  mode: 'local',
+  agentName: 'agi-2-claude'
+});
+
+// Start with multimodal features
+await driver.start({
+  goal: 'Help me with my computer',
+  mode: 'local',
+  agent_name: 'agi-2-claude',
+
+  // Voice features
+  audio_input_enabled: true,
+  turn_detection_enabled: true,
+  speech_output_enabled: true,
+  speech_voice: 'alloy',
+
+  // Video features
+  camera_enabled: true,
+  screen_recording_enabled: true,
+
+  // MCP servers
+  mcp_servers: [
+    {
+      name: 'filesystem',
+      command: 'npx',
+      args: ['-y', '@modelcontextprotocol/server-filesystem', '/path/to/dir'],
+      env: {}
+    }
+  ],
+
+  // Tool choice
+  tool_choice: 'auto'
+});
+```
+
+### Handling New Events
+
+```typescript
+driver.on('audio_transcript', (event: AudioTranscriptEvent) => {
+  console.log(`Transcript: ${event.transcript}`);
+});
+
+driver.on('video_frame', (event: VideoFrameEvent) => {
+  // event.frame_base64 contains JPEG frame
+  saveFrame(event.frame_base64);
+});
+
+driver.on('speech_started', (event: SpeechStartedEvent) => {
+  console.log(`🔊 Speaking: ${event.text}`);
+});
+
+driver.on('speech_finished', () => {
+  console.log('✓ Finished speaking');
+});
+
+driver.on('turn_detected', (event: TurnDetectedEvent) => {
+  console.log(`You said: ${event.transcript}`);
+});
+```
+
+### Voice-Only Mode
+
+```typescript
+await driver.start({
+  goal: '(voice input)',
+  mode: 'local',
+  audio_input_enabled: true,
+  turn_detection_enabled: true,
+  turn_detection_silence_ms: 1000,  // 1 second of silence = turn complete
+  speech_output_enabled: true,
+  speech_voice: 'alloy'  // or: echo, fable, onyx, nova, shimmer
+});
+```
+
+### MCP Servers
+
+```typescript
+const mcpServers: MCPServerConfig[] = [
+  {
+    name: 'filesystem',
+    command: 'npx',
+    args: ['-y', '@modelcontextprotocol/server-filesystem', '/Users/you/Documents']
+  },
+  {
+    name: 'database',
+    command: 'python',
+    args: ['-m', 'my_db_server'],
+    env: { DATABASE_URL: 'postgresql://...' }
+  }
+];
+
+await driver.start({
+  goal: 'Analyze my documents',
+  mode: 'local',
+  mcp_servers: mcpServers
+});
+```
+
+### Tool Choice Configuration
+
+```typescript
+// Auto (default)
+tool_choice: 'auto'
+
+// Required - must use at least one tool
+tool_choice: 'required'
+
+// None - no tool use
+tool_choice: 'none'
+
+// Specific tool
+tool_choice: { type: 'tool', name: 'filesystem__read_file' }
+```
+
+## Breaking Changes
+
+⚠️ This is a breaking change with no backwards compatibility.
+
+- `StartCommand` interface has many new optional fields
+- New event types may be emitted
+- `agent_name` should be set to `"agi-2-claude"` for new agents
+
+## Testing
+
+```bash
+# Install updated SDK
+npm install
+
+# Build TypeScript
+npm run build
+
+# Run tests
+npm test
+
+# Try a voice session
+node -e "
+const { AgentDriver } = require('./dist');
+
+(async () => {
+  const driver = new AgentDriver({ mode: 'local' });
+  const result = await driver.start({
+    goal: 'Test voice',
+    mode: 'local',
+    audio_input_enabled: true,
+    speech_output_enabled: true
+  });
+  console.log(result);
+})();
+"
+```
+
+## Related PRs
+
+- agi-api (driver): https://github.com/agi-inc/agents/pull/344
+- agi-python: https://github.com/agi-inc/agi-python/pull/8
+- agi-csharp: TBD
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@agi_inc/agi-js",
-  "version": "0.4.2",
+  "version": "0.5.0",
   "description": "Official TypeScript/JavaScript SDK for AGI.tech API",
   "main": "./dist/index.js",
   "module": "./dist/index.mjs",

diff --git a/src/driver/driver.ts b/src/driver/driver.ts
@@ -8,6 +8,9 @@
 import { spawn, ChildProcess } from 'child_process';
 import { EventEmitter } from 'events';
 import { createInterface, Interface } from 'readline';
+import { readFileSync, existsSync } from 'fs';
+import { resolve } from 'path';
+import { homedir } from 'os';
 import { findBinaryPath } from './binary';
 import {
   DriverEvent,
@@ -34,7 +37,7 @@
   /** Platform type (default: 'desktop') */
   platform?: 'desktop' | 'android';
   /** "local" for autonomous mode, "remote" for managed VM, "" for legacy SDK-driven mode */
-  mode?: string;
+  mode?: '' | 'local' | 'remote';
   /** Agent name for the AGI API (e.g., "agi-2-claude") */
   agentName?: string;
   /** AGI API base URL (default: "https://api.agi.tech") */
@@ -43,6 +46,18 @@
   environmentType?: string;
   /** Environment variables to pass to the driver process */
   env?: Record<string, string>;
+
+  // Multimodal options
+  /** Enable voice input/output */
+  voice?: boolean;
+  /** Enable camera video feed */
+  camera?: boolean;
+  /** Enable screen recording */
+  screen?: boolean;
+  /** Enable MCP servers */
+  mcp?: boolean;
+  /** Path to MCP config file */
+  mcpConfig?: string;
 }
 
 /**
@@ -90,12 +105,19 @@
   private readonly binaryPath: string;
   private readonly model: string;
   private readonly platform: 'desktop' | 'android';
-  private readonly mode: string;
+  private readonly mode: '' | 'local' | 'remote';
   private readonly agentName: string;
   private readonly apiUrl: string;
   private readonly environmentType: string;
   private readonly env: Record<string, string>;
 
+  // Multimodal options
+  private readonly voice: boolean;
+  private readonly camera: boolean;
+  private readonly screen: boolean;
+  private readonly mcp: boolean;
+  private readonly mcpConfig: string;
+
   private process: ChildProcess | null = null;
   private readline: Interface | null = null;
   private state: DriverState = 'idle';
@@ -111,6 +133,38 @@
   private pendingConfirm: ((approved: boolean, message?: string) => void) | null = null;
   private pendingAnswer: ((text: string) => void) | null = null;
 
+  /**
+   * Load MCP server configuration from a JSON file (best-effort).
+   * @param configPath - Config path; only a leading `~`, `~/` or `~\` is home-expanded
+   * @returns Array of MCP server configs, or undefined if missing, unreadable or invalid
+   */
+  private loadMcpConfig(configPath: string): any[] | undefined {
+    try {
+      // Do not expand `~user/...` or `~name`: slicing two chars would mangle the path.
+      const isHomeRelative = configPath === '~' || /^~[\\/]/.test(configPath);
+      const expandedPath = isHomeRelative
+        ? resolve(homedir(), configPath.slice(2))
+        : resolve(configPath);
+
+      if (!existsSync(expandedPath)) {
+        return undefined;
+      }
+
+      const config = JSON.parse(readFileSync(expandedPath, 'utf-8'));
+
+      // Convert the name -> server map into an array of MCPServerConfig-shaped objects.
+      return Object.entries(config).map(([name, serverConfig]: [string, any]) => ({
+        name,
+        command: serverConfig.command,
+        args: serverConfig.args || [],
+        env: serverConfig.env || {},
+      }));
+    } catch (error) {
+      // Best-effort: any read/parse failure yields undefined (MCP will be disabled).
+      return undefined;
+    }
+  }
+
   constructor(options: DriverOptions = {}) {
     super();
 
@@ -123,6 +177,13 @@
     this.apiUrl = options.apiUrl ?? '';
     this.environmentType = options.environmentType ?? '';
     this.env = options.env ?? {};
+
+    // Multimodal options
+    this.voice = options.voice ?? false;
+    this.camera = options.camera ?? false;
+    this.screen = options.screen ?? false;
+    this.mcp = options.mcp ?? false;
+    this.mcpConfig = options.mcpConfig ?? '~/.agi/mcp.json';
   }
 
   /**
@@ -168,7 +229,7 @@
     screenshot: string = '',
     screenWidth: number = 0,
     screenHeight: number = 0,
-    mode?: string
+    mode?: '' | 'local' | 'remote'
   ): Promise<DriverResult> {
     if (this.process) {
       throw new Error('Driver is already running');
@@ -248,6 +309,15 @@
           agent_name: this.agentName || undefined,
           api_url: this.apiUrl || undefined,
           environment_type: this.environmentType || undefined,
+
+          // Multimodal options
+          audio_input_enabled: this.voice,
+          turn_detection_enabled: this.voice,
+          speech_output_enabled: this.voice,
+          speech_voice: this.voice ? 'alloy' : undefined,
+          camera_enabled: this.camera,
+          screen_recording_enabled: this.screen,
+          mcp_servers: this.mcp ? this.loadMcpConfig(this.mcpConfig) : undefined,
         };
         this.sendCommand(startCmd);
       });
@@ -464,6 +534,26 @@
         this.emit('session_created', event);
         break;
 
+      case 'audio_transcript':
+        this.emit('audio_transcript', event);
+        break;
+
+      case 'video_frame':
+        this.emit('video_frame', event);
+        break;
+
+      case 'speech_started':
+        this.emit('speech_started', event);
+        break;
+
+      case 'speech_finished':
+        this.emit('speech_finished', event);
+        break;
+
+      case 'turn_detected':
+        this.emit('turn_detected', event);
+        break;
+
       case 'finished':
         this.handleFinished(event);
         break;