diff --git a/MULTIMODAL_UPDATES.md b/MULTIMODAL_UPDATES.md
new file mode 100644
index 0000000..8dfa90f
--- /dev/null
+++ b/MULTIMODAL_UPDATES.md
@@ -0,0 +1,202 @@
+# Multimodal Driver Support - Node SDK Updates
+
+This update adds comprehensive multimodal support to the Node SDK to match the new agi-driver capabilities.
+
+## Changes Made
+
+### Protocol Updates (`src/driver/protocol.ts`)
+
+#### New Event Types
+- `AudioTranscriptEvent`: Audio transcript from buffer
+- `VideoFrameEvent`: Video frame from camera/screen
+- `SpeechStartedEvent`: TTS playback started
+- `SpeechFinishedEvent`: TTS playback finished
+- `TurnDetectedEvent`: Voice turn detection
+
+#### New Command Types
+- `GetAudioTranscriptCommand`: Request audio transcript
+- `GetVideoFrameCommand`: Request video frame
+
+#### New Interfaces
+- `MCPServerConfig`: MCP server configuration
+- `AgentIdentity`: Agent identity information
+- `ToolChoice`: Tool choice configuration type
+
+#### Updated StartCommand
+Added fields for multimodal configuration:
+- `agent_identity?: AgentIdentity` - Agent identity (default: agi-2-claude by AGI Company)
+- `tool_choice?: ToolChoice` - Tool choice mode
+- `mcp_servers?: MCPServerConfig[]` - MCP server configurations
+- `audio_input_enabled?: boolean`, `audio_buffer_seconds?: number`
+- `turn_detection_enabled?: boolean`, `turn_detection_silence_ms?: number`
+- `speech_output_enabled?: boolean`, `speech_voice?: string`
+- `camera_enabled?: boolean`, `camera_buffer_seconds?: number`
+- `screen_recording_enabled?: boolean`, `screen_recording_buffer_seconds?: number`
+
+### Exports (`src/driver/index.ts`)
+Added exports for all new event and command types, plus helper interfaces.
+
+## Usage Examples
+
+### Basic Multimodal Session
+
+```typescript
+import { AgentDriver } from '@agi_inc/agi-js';
+
+const driver = new AgentDriver({
+  mode: 'local',
+  agentName: 'agi-2-claude'
+});
+
+// Start with multimodal features
+await driver.start({
+  goal: 'Help me with my computer',
+  mode: 'local',
+  agent_name: 'agi-2-claude',
+
+  // Voice features
+  audio_input_enabled: true,
+  turn_detection_enabled: true,
+  speech_output_enabled: true,
+  speech_voice: 'alloy',
+
+  // Video features
+  camera_enabled: true,
+  screen_recording_enabled: true,
+
+  // MCP servers
+  mcp_servers: [
+    {
+      name: 'filesystem',
+      command: 'npx',
+      args: ['-y', '@modelcontextprotocol/server-filesystem', '/path/to/dir'],
+      env: {}
+    }
+  ],
+
+  // Tool choice
+  tool_choice: 'auto'
+});
+```
+
+### Handling New Events
+
+```typescript
+driver.on('audio_transcript', (event: AudioTranscriptEvent) => {
+  console.log(`Transcript: ${event.transcript}`);
+});
+
+driver.on('video_frame', (event: VideoFrameEvent) => {
+  // event.frame_base64 contains JPEG frame
+  saveFrame(event.frame_base64);
+});
+
+driver.on('speech_started', (event: SpeechStartedEvent) => {
+  console.log(`🔊 Speaking: ${event.text}`);
+});
+
+driver.on('speech_finished', () => {
+  console.log('✓ Finished speaking');
+});
+
+driver.on('turn_detected', (event: TurnDetectedEvent) => {
+  console.log(`You said: ${event.transcript}`);
+});
+```
+
+### Voice-Only Mode
+
+```typescript
+await driver.start({
+  goal: '(voice input)',
+  mode: 'local',
+  audio_input_enabled: true,
+  turn_detection_enabled: true,
+  turn_detection_silence_ms: 1000,  // 1 second of silence = turn complete
+  speech_output_enabled: true,
+  speech_voice: 'alloy'  // or: echo, fable, onyx, nova, shimmer
+});
+```
+
+### MCP Servers
+
+```typescript
+const mcpServers: MCPServerConfig[] = [
+  {
+    name: 'filesystem',
+    command: 'npx',
+    args: ['-y', '@modelcontextprotocol/server-filesystem', '/Users/you/Documents']
+  },
+  {
+    name: 'database',
+    command: 'python',
+    args: ['-m', 'my_db_server'],
+    env: { DATABASE_URL: 'postgresql://...' }
+  }
+];
+
+await driver.start({
+  goal: 'Analyze my documents',
+  mode: 'local',
+  mcp_servers: mcpServers
+});
+```
+
+### Tool Choice Configuration
+
+```typescript
+// Auto (default)
+tool_choice: 'auto'
+
+// Required - must use at least one tool
+tool_choice: 'required'
+
+// None - no tool use
+tool_choice: 'none'
+
+// Specific tool
+tool_choice: { type: 'tool', name: 'filesystem__read_file' }
+```
+
+## Breaking Changes
+
+⚠️ This is a breaking change with no backwards compatibility.
+
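+- `StartCommand.mode` is now required and, like `DriverOptions.mode`, is narrowed from `string` to `'' | 'local' | 'remote'`; `StartCommand` also gains many new optional fields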
+- New event types may be emitted
+- `agent_name` should be set to `"agi-2-claude"` for new agents
+
+## Testing
+
+```bash
+# Install updated SDK
+npm install
+
+# Build TypeScript
+npm run build
+
+# Run tests
+npm test
+
+# Try a voice session
+node -e "
+const { AgentDriver } = require('./dist');
+
+(async () => {
+  const driver = new AgentDriver({ mode: 'local' });
+  const result = await driver.start({
+    goal: 'Test voice',
+    mode: 'local',
+    audio_input_enabled: true,
+    speech_output_enabled: true
+  });
+  console.log(result);
+})();
+"
+```
+
+## Related PRs
+
+- agi-api (driver): https://github.com/agi-inc/agents/pull/344
+- agi-python: https://github.com/agi-inc/agi-python/pull/8
+- agi-csharp: TBD
diff --git a/package-lock.json b/package-lock.json
index 23ef233..b7a4e5d 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -33,18 +33,6 @@
         "@agi/agi-win32-x64": "0.4.0"
       }
     },
-    "node_modules/@agi/agi-darwin-arm64": {
-      "optional": true
-    },
-    "node_modules/@agi/agi-darwin-x64": {
-      "optional": true
-    },
-    "node_modules/@agi/agi-linux-x64": {
-      "optional": true
-    },
-    "node_modules/@agi/agi-win32-x64": {
-      "optional": true
-    },
     "node_modules/@ampproject/remapping": {
       "version": "2.3.0",
       "resolved": "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.3.0.tgz",
diff --git a/package.json b/package.json
index f02584c..2c3d59a 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@agi_inc/agi-js",
-  "version": "0.4.2",
+  "version": "0.5.0",
   "description": "Official TypeScript/JavaScript SDK for AGI.tech API",
   "main": "./dist/index.js",
   "module": "./dist/index.mjs",
diff --git a/src/driver/driver.ts b/src/driver/driver.ts
index 37b3ac6..950cb0a 100644
--- a/src/driver/driver.ts
+++ b/src/driver/driver.ts
@@ -8,6 +8,9 @@
 import { spawn, ChildProcess } from 'child_process';
 import { EventEmitter } from 'events';
 import { createInterface, Interface } from 'readline';
+import { readFileSync, existsSync } from 'fs';
+import { resolve } from 'path';
+import { homedir } from 'os';
 import { findBinaryPath } from './binary';
 import {
   DriverEvent,
@@ -34,7 +37,7 @@ export interface DriverOptions {
   /** Platform type (default: 'desktop') */
   platform?: 'desktop' | 'android';
   /** "local" for autonomous mode, "remote" for managed VM, "" for legacy SDK-driven mode */
-  mode?: string;
+  mode?: '' | 'local' | 'remote';
   /** Agent name for the AGI API (e.g., "agi-2-claude") */
   agentName?: string;
   /** AGI API base URL (default: "https://api.agi.tech") */
@@ -43,6 +46,18 @@ export interface DriverOptions {
   environmentType?: string;
   /** Environment variables to pass to the driver process */
   env?: Record<string, string>;
+
+  // Multimodal options
+  /** Enable voice input/output */
+  voice?: boolean;
+  /** Enable camera video feed */
+  camera?: boolean;
+  /** Enable screen recording */
+  screen?: boolean;
+  /** Enable MCP servers */
+  mcp?: boolean;
+  /** Path to MCP config file */
+  mcpConfig?: string;
 }
 
 /**
@@ -90,12 +105,19 @@ export class AgentDriver extends EventEmitter {
   private readonly binaryPath: string;
   private readonly model: string;
   private readonly platform: 'desktop' | 'android';
-  private readonly mode: string;
+  private readonly mode: '' | 'local' | 'remote';
   private readonly agentName: string;
   private readonly apiUrl: string;
   private readonly environmentType: string;
   private readonly env: Record<string, string>;
 
+  // Multimodal options
+  private readonly voice: boolean;
+  private readonly camera: boolean;
+  private readonly screen: boolean;
+  private readonly mcp: boolean;
+  private readonly mcpConfig: string;
+
   private process: ChildProcess | null = null;
   private readline: Interface | null = null;
   private state: DriverState = 'idle';
@@ -111,6 +133,38 @@ export class AgentDriver extends EventEmitter {
   private pendingConfirm: ((approved: boolean, message?: string) => void) | null = null;
   private pendingAnswer: ((text: string) => void) | null = null;
 
+  /**
+   * Load MCP server configuration from a JSON file keyed by server name.
+   * @param configPath - Config file path; a leading `~` or `~/` expands to the home directory
+   * @returns Array of MCP server configs, or undefined if the file is missing or unreadable
+   */
+  private loadMcpConfig(configPath: string): any[] | undefined {
+    try {
+      // Expand only a bare `~` or a `~/` (`~\`) prefix; `~user` or `~name.json` stay literal
+      const expandedPath = /^~($|[\\/])/.test(configPath)
+        ? resolve(homedir(), configPath.slice(2))
+        : resolve(configPath);
+
+      if (!existsSync(expandedPath)) {
+        return undefined;
+      }
+
+      const content = readFileSync(expandedPath, 'utf-8');
+      const config = JSON.parse(content);
+
+      // Convert config object to array of MCPServerConfig
+      return Object.entries(config).map(([name, serverConfig]: [string, any]) => ({
+        name,
+        command: serverConfig.command,
+        args: serverConfig.args || [],
+        env: serverConfig.env || {},
+      }));
+    } catch {
+      // Best effort: any read/parse failure yields undefined (MCP will be disabled)
+      return undefined;
+    }
+  }
+
   constructor(options: DriverOptions = {}) {
     super();
 
@@ -123,6 +177,13 @@ export class AgentDriver extends EventEmitter {
     this.apiUrl = options.apiUrl ?? '';
     this.environmentType = options.environmentType ?? '';
     this.env = options.env ?? {};
+
+    // Multimodal options
+    this.voice = options.voice ?? false;
+    this.camera = options.camera ?? false;
+    this.screen = options.screen ?? false;
+    this.mcp = options.mcp ?? false;
+    this.mcpConfig = options.mcpConfig ?? '~/.agi/mcp.json';
   }
 
   /**
@@ -168,7 +229,7 @@ export class AgentDriver extends EventEmitter {
     screenshot: string = '',
     screenWidth: number = 0,
     screenHeight: number = 0,
-    mode?: string
+    mode?: '' | 'local' | 'remote'
   ): Promise<DriverResult> {
     if (this.process) {
       throw new Error('Driver is already running');
@@ -248,6 +309,15 @@ export class AgentDriver extends EventEmitter {
           agent_name: this.agentName || undefined,
           api_url: this.apiUrl || undefined,
           environment_type: this.environmentType || undefined,
+
+          // Multimodal options
+          audio_input_enabled: this.voice,
+          turn_detection_enabled: this.voice,
+          speech_output_enabled: this.voice,
+          speech_voice: this.voice ? 'alloy' : undefined,
+          camera_enabled: this.camera,
+          screen_recording_enabled: this.screen,
+          mcp_servers: this.mcp ? this.loadMcpConfig(this.mcpConfig) : undefined,
         };
         this.sendCommand(startCmd);
       });
@@ -464,6 +534,26 @@ export class AgentDriver extends EventEmitter {
         this.emit('session_created', event);
         break;
 
+      case 'audio_transcript':
+        this.emit('audio_transcript', event);
+        break;
+
+      case 'video_frame':
+        this.emit('video_frame', event);
+        break;
+
+      case 'speech_started':
+        this.emit('speech_started', event);
+        break;
+
+      case 'speech_finished':
+        this.emit('speech_finished', event);
+        break;
+
+      case 'turn_detected':
+        this.emit('turn_detected', event);
+        break;
+
       case 'finished':
         this.handleFinished(event);
         break;
diff --git a/src/driver/index.ts b/src/driver/index.ts
index c51431f..4909820 100644
--- a/src/driver/index.ts
+++ b/src/driver/index.ts
@@ -21,6 +21,11 @@ export {
   type ErrorEvent,
   type ScreenshotCapturedEvent,
   type SessionCreatedEvent,
+  type AudioTranscriptEvent,
+  type VideoFrameEvent,
+  type SpeechStartedEvent,
+  type SpeechFinishedEvent,
+  type TurnDetectedEvent,
   type StartCommand,
   type ScreenshotCommand,
   type PauseCommand,
@@ -28,6 +33,11 @@ export {
   type StopCommand,
   type ConfirmResponseCommand,
   type AnswerCommand,
+  type GetAudioTranscriptCommand,
+  type GetVideoFrameCommand,
+  type MCPServerConfig,
+  type AgentIdentity,
+  type ToolChoice,
 } from './protocol';
 
 export { findBinaryPath, isBinaryAvailable, getPlatformId, type PlatformId } from './binary';
diff --git a/src/driver/protocol.ts b/src/driver/protocol.ts
index c55c20e..c3feb98 100644
--- a/src/driver/protocol.ts
+++ b/src/driver/protocol.ts
@@ -17,7 +17,12 @@ export type EventType =
   | 'finished'
   | 'error'
   | 'screenshot_captured'
-  | 'session_created';
+  | 'session_created'
+  | 'audio_transcript'
+  | 'video_frame'
+  | 'speech_started'
+  | 'speech_finished'
+  | 'turn_detected';
 
 // Command types
 export type CommandType =
@@ -27,7 +32,9 @@ export type CommandType =
   | 'resume'
   | 'stop'
   | 'confirm'
-  | 'answer';
+  | 'answer'
+  | 'get_audio_transcript'
+  | 'get_video_frame';
 
 // Driver states
 export type DriverState =
@@ -114,6 +121,49 @@ export interface SessionCreatedEvent extends BaseEvent {
   vnc_url?: string;
 }
 
+/**
+ * Emitted when an audio transcript is available (AgentDriver re-emits it as 'audio_transcript').
+ */
+export interface AudioTranscriptEvent extends BaseEvent {
+  event: 'audio_transcript';
+  transcript: string;
+  seconds_ago: number; // NOTE(review): presumably how long ago the clip started, in seconds — confirm
+  duration: number; // NOTE(review): presumably clip length in seconds — confirm
+}
+
+/**
+ * Emitted when a video frame is available (AgentDriver re-emits it as 'video_frame').
+ */
+export interface VideoFrameEvent extends BaseEvent {
+  event: 'video_frame';
+  frame_base64: string; // NOTE(review): MULTIMODAL_UPDATES.md says this holds a JPEG frame — confirm
+  source: 'camera' | 'screen';
+  seconds_ago: number; // NOTE(review): presumably frame age in seconds — confirm
+}
+
+/**
+ * Emitted when TTS speech starts playing (AgentDriver re-emits it as 'speech_started').
+ */
+export interface SpeechStartedEvent extends BaseEvent {
+  event: 'speech_started';
+  text: string; // NOTE(review): presumably the text being spoken — confirm
+}
+
+/**
+ * Emitted when TTS speech finishes playing (AgentDriver re-emits it as 'speech_finished').
+ */
+export interface SpeechFinishedEvent extends BaseEvent {
+  event: 'speech_finished';
+}
+
+/**
+ * Emitted when turn detection finds the user has stopped speaking (re-emitted as 'turn_detected').
+ */
+export interface TurnDetectedEvent extends BaseEvent {
+  event: 'turn_detected';
+  transcript: string; // NOTE(review): presumably what the user said during the turn — confirm
+}
+
 export type DriverEvent =
   | ReadyEvent
   | StateChangeEvent
@@ -124,7 +174,12 @@ export type DriverEvent =
   | FinishedEvent
   | ErrorEvent
   | ScreenshotCapturedEvent
-  | SessionCreatedEvent;
+  | SessionCreatedEvent
+  | AudioTranscriptEvent
+  | VideoFrameEvent
+  | SpeechStartedEvent
+  | SpeechFinishedEvent
+  | TurnDetectedEvent;
 
 // Action type from the driver
 export interface DriverAction {
@@ -134,6 +189,24 @@ export interface DriverAction {
   [key: string]: unknown;
 }
 
+// MCP server configuration; entries are passed to the driver as StartCommand.mcp_servers
+export interface MCPServerConfig {
+  name: string;
+  command: string; // e.g. 'npx' or 'python' in the documented examples
+  args: string[];
+  env?: Record<string, string>;
+}
+
+// Agent identity, passed to the driver as the optional StartCommand.agent_identity
+export interface AgentIdentity {
+  name: string;
+  creator: string;
+  creator_url: string;
+}
+
+// Tool choice: 'auto', 'required' (must use a tool), 'none' (no tool use), or one named tool
+export type ToolChoice = 'auto' | 'required' | 'none' | { type: 'tool'; name: string };
+
 // Base command interface
 export interface BaseCommand {
   command: CommandType;
@@ -148,14 +221,25 @@ export interface StartCommand extends BaseCommand {
   screen_height: number;
   platform: 'desktop' | 'android';
   model: string;
-  /** "local" for autonomous mode, "remote" for managed VM, "" for legacy SDK-driven mode */
-  mode?: string;
-  /** Agent name for the AGI API (e.g., "agi-2-claude") */
+  mode: '' | 'local' | 'remote';
   agent_name?: string;
-  /** AGI API base URL (default: "https://api.agi.tech") */
   api_url?: string;
-  /** Environment type for remote mode ("ubuntu-1" or "chrome-1") */
   environment_type?: string;
+
+  // Multimodal features
+  agent_identity?: AgentIdentity;
+  tool_choice?: ToolChoice;
+  mcp_servers?: MCPServerConfig[];
+  audio_input_enabled?: boolean;
+  audio_buffer_seconds?: number;
+  turn_detection_enabled?: boolean;
+  turn_detection_silence_ms?: number;
+  speech_output_enabled?: boolean;
+  speech_voice?: 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
+  camera_enabled?: boolean;
+  camera_buffer_seconds?: number;
+  screen_recording_enabled?: boolean;
+  screen_recording_buffer_seconds?: number;
 }
 
 export interface ScreenshotCommand extends BaseCommand {
@@ -190,6 +274,18 @@ export interface AnswerCommand extends BaseCommand {
   question_id?: string;
 }
 
+export interface GetAudioTranscriptCommand extends BaseCommand { // requests an audio transcript
+  command: 'get_audio_transcript';
+  seconds_ago: number; // NOTE(review): presumably how far back the window starts, in seconds — confirm
+  duration: number; // NOTE(review): presumably window length in seconds — confirm
+}
+
+export interface GetVideoFrameCommand extends BaseCommand { // requests a video frame
+  command: 'get_video_frame';
+  source: 'camera' | 'screen';
+  seconds_ago: number; // NOTE(review): presumably how far back in the buffer, in seconds — confirm
+}
+
 export type DriverCommand =
   | StartCommand
   | ScreenshotCommand
@@ -197,7 +293,9 @@ export type DriverCommand =
   | ResumeCommand
   | StopCommand
   | ConfirmResponseCommand
-  | AnswerCommand;
+  | AnswerCommand
+  | GetAudioTranscriptCommand
+  | GetVideoFrameCommand;
 
 /**
  * Parse a JSON line into a DriverEvent.